Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc64/power7')
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies | 2
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile | 11
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S | 98
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S | 70
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S | 69
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S | 68
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S | 199
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S | 1061
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S | 430
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S | 835
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S | 472
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S | 201
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S | 399
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S | 115
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S | 24
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S | 126
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S | 5
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S | 230
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S | 131
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S | 168
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S | 107
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S | 227
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S | 722
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S | 182
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S | 260
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c | 27
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S | 521
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S | 23
38 files changed, 6793 insertions(+), 0 deletions(-)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies
new file mode 100644
index 0000000000..9d68f39d22
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power6/fpu
+powerpc/powerpc64/power6
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile
new file mode 100644
index 0000000000..89a2296085
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile
@@ -0,0 +1,11 @@
+ifeq ($(subdir),elf)
+# Prevent the use of VSX registers and insns in _dl_start, which under -O3
+# optimization may require a TOC reference before relocations are resolved.
+CFLAGS-rtld.c += -mno-vsx
+endif
+
+ifeq ($(subdir),string)
+sysdep_routines += strstr-ppc64
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S
new file mode 100644
index 0000000000..6425afbc9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S
@@ -0,0 +1,98 @@
+/* PowerPC64 mpn_add_n/mpn_sub_n -- mpn addition and
+   subtraction.
+ Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* cycles/limb
+ * POWER7 2.18
+ */
+
+#ifdef USE_AS_SUB
+# define FUNC __mpn_sub_n
+# define ADDSUBC subfe
+#else
+# define FUNC __mpn_add_n
+# define ADDSUBC adde
+#endif
+
+#define RP r3
+#define UP r4
+#define VP r5
+#define N r6
+
+EALIGN(FUNC, 5, 0)
+#ifdef USE_AS_SUB
+	addic	r0, r1, -1	/* Set CA (no incoming borrow); r1 != 0.  */
+#else
+	addic	r0, r0, 0	/* Clear CA (no incoming carry).  */
+#endif
+ andi. r7, N, 1
+ beq L(bx0)
+
+ ld r7, 0(UP)
+	ld	r9, 0(VP)
+	ADDSUBC	r11, r9, r7
+	std	r11, 0(RP)
+	cmpldi	N, 1
+	beq	L(end)
+ addi UP, UP, 8
+ addi VP, VP, 8
+ addi RP, RP, 8
+
+L(bx0): addi r0, N, 2
+ srdi r0, r0, 2
+ mtctr r0
+
+ andi. r7, N, 2
+ bne L(mid)
+
+ addi UP, UP, 16
+ addi VP, VP, 16
+ addi RP, RP, 16
+
+ .align 5
+L(top): ld r6, -16(UP)
+ ld r7, -8(UP)
+ ld r8, -16(VP)
+ ld r9, -8(VP)
+	ADDSUBC	r10, r8, r6
+ ADDSUBC r11, r9, r7
+ std r10, -16(RP)
+ std r11, -8(RP)
+L(mid): ld r6, 0(UP)
+ ld r7, 8(UP)
+ ld r8, 0(VP)
+ ld r9, 8(VP)
+	ADDSUBC	r10, r8, r6
+ ADDSUBC r11, r9, r7
+ std r10, 0(RP)
+ std r11, 8(RP)
+ addi UP, UP, 32
+ addi VP, VP, 32
+ addi RP, RP, 32
+ bdnz L(top)
+
+L(end): subfe r3, r0, r0
+#ifdef USE_AS_SUB
+ neg r3, r3
+#else
+ addi r3, r3, 1
+#endif
+ blr
+END(FUNC)
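For reference, here is a minimal C model of what __mpn_add_n computes (the
function name and limb_t typedef are illustrative, not glibc's actual gmp
types); the assembly above keeps the running carry in the CA bit and unrolls
the loop four limbs at a time instead:

#include <stddef.h>

typedef unsigned long long limb_t;   /* Stand-in for mp_limb_t.  */

/* rp[] = up[] + vp[] over n limbs; returns the final carry (0 or 1).
   __mpn_sub_n has the same shape with a borrow instead of a carry.  */
static limb_t
mpn_add_n_model (limb_t *rp, const limb_t *up, const limb_t *vp, size_t n)
{
  limb_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      limb_t u = up[i];
      limb_t sum = u + vp[i] + carry;
      /* Carry out iff the 64-bit sum wrapped around.  */
      carry = carry ? (sum <= u) : (sum < u);
      rp[i] = sum;
    }
  return carry;
}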
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c
new file mode 100644
index 0000000000..4a6a400e7a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c
@@ -0,0 +1 @@
+/* Implemented at memmove.S */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies
new file mode 100644
index 0000000000..30fa17646e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/fpu
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies
new file mode 100644
index 0000000000..410d289a6d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/fpu/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
new file mode 100644
index 0000000000..9ccc758c9e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -0,0 +1,70 @@
+/* finite(). PowerPC64/POWER7 version.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x) */
+ .section ".toc","aw"
+.LC0: /* 1.0 */
+ .tc FD_ONE[TC],0x3ff0000000000000
+ .section ".text"
+ .type __finite, @function
+ .machine power7
+EALIGN (__finite, 4, 0)
+ CALL_MCOUNT 0
+ lfd fp0,.LC0@toc(r2)
+ ftdiv cr7,fp1,fp0
+ li r3,1
+ bflr 30
+
+ /* If we are here, we either have +/-INF,
+ NaN or denormal. */
+
+ stfd fp1,-16(r1) /* Transfer FP to GPR's. */
+ ori 2,2,0 /* Force a new dispatch group. */
+ lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+ (biased exponent and sign bit). */
+ clrlwi r4,r4,17 /* r4 = abs(r4). */
+ cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
+	bltlr	cr7	/* LT means finite, otherwise non-finite.  */
+ li r3,0
+ blr
+ END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#if IS_IN (libm)
+# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
+compat_symbol (libm, __finite, __finitel, GLIBC_2_0)
+compat_symbol (libm, finite, finitel, GLIBC_2_0)
+# endif
+#else
+# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
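As a cross-check, the integer fallback path above is equivalent to this C
sketch (the ftdiv fast path has no portable C equivalent; the constants are
just the IEEE 754 binary64 layout):

#include <stdint.h>
#include <string.h>

/* Finite iff the biased exponent field is not all ones (0x7ff),
   mirroring the stfd/lhz/clrlwi/cmpwi sequence above.  */
static int
finite_model (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);   /* FPR -> GPR, like stfd + reload.  */
  return ((bits >> 52) & 0x7ff) != 0x7ff;
}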
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S. */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
new file mode 100644
index 0000000000..4482cddcfa
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -0,0 +1,69 @@
+/* isinf(). PowerPC64/POWER7 version.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x) */
+ .section ".toc","aw"
+.LC0: /* 1.0 */
+ .tc FD_ONE[TC],0x3ff0000000000000
+ .section ".text"
+ .type __isinf, @function
+ .machine power7
+EALIGN (__isinf, 4, 0)
+ CALL_MCOUNT 0
+ lfd fp0,.LC0@toc(r2)
+ ftdiv cr7,fp1,fp0
+ li r3,0
+ bflr 29 /* If not INF, return. */
+
+ /* Either we have -INF/+INF or a denormal. */
+
+ stfd fp1,-16(r1) /* Transfer FP to GPR's. */
+ ori 2,2,0 /* Force a new dispatch group. */
+ lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+ (biased exponent and sign bit). */
+ cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
+ li r3,1
+ beqlr cr7 /* EQ means INF, otherwise -INF. */
+ li r3,-1
+ blr
+ END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
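The same style of sketch for this routine's return convention (1 for +INF,
-1 for -INF, 0 otherwise); names here are illustrative:

#include <stdint.h>
#include <string.h>

static int
isinf_model (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  if ((bits & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL)
    return 0;                        /* Not an infinity.  */
  return (bits >> 63) ? -1 : 1;      /* Sign bit picks -INF vs. +INF.  */
}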
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S. */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
new file mode 100644
index 0000000000..46b08a0d37
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
@@ -0,0 +1,68 @@
+/* isnan(). PowerPC64/POWER7 version.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x) */
+ .section ".toc","aw"
+.LC0: /* 1.0 */
+ .tc FD_ONE[TC],0x3ff0000000000000
+ .section ".text"
+ .type __isnan, @function
+ .machine power7
+EALIGN (__isnan, 4, 0)
+ CALL_MCOUNT 0
+ lfd fp0,.LC0@toc(r2)
+ ftdiv cr7,fp1,fp0
+ li r3,0
+ bflr 30 /* If not NaN, finish. */
+
+ stfd fp1,-16(r1) /* Transfer FP to GPR's. */
+ ori 2,2,0 /* Force a new dispatch group. */
+ ld r4,-16(r1) /* Load FP into GPR. */
+ lis r0,0x7ff0
+	sldi	r0,r0,32	/* r0 = 0x7ff0000000000000.  */
+ clrldi r4,r4,1 /* x = fabs(x) */
+ cmpd cr7,r4,r0 /* if (fabs(x) <= inf) */
+ blelr cr7 /* LE means not NaN. */
+ li r3,1 /* else return 1 */
+ blr
+ END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
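And the NaN test: the clrldi/cmpd pair above is exactly an unsigned
comparison of the absolute bit pattern against +INF's pattern, as in this
sketch:

#include <stdint.h>
#include <string.h>

/* NaN iff abs(bits) is strictly greater than the +INF pattern,
   i.e. exponent all ones and a nonzero mantissa.  */
static int
isnan_model (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  return (bits & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL;
}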
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S. */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c
new file mode 100644
index 0000000000..2599c771d9
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/power7/fpu/s_logb.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c
new file mode 100644
index 0000000000..7a5a8032e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/power7/fpu/s_logbf.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c
new file mode 100644
index 0000000000..524ae2c78d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/power7/fpu/s_logbl.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S
new file mode 100644
index 0000000000..5e9707aa02
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S
@@ -0,0 +1,199 @@
+/* Optimized memchr implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* int [r3] memchr (char *s [r3], int byte [r4], int size [r5]) */
+
+#ifndef MEMCHR
+# define MEMCHR __memchr
+#endif
+ .machine power7
+ENTRY (MEMCHR)
+ CALL_MCOUNT 3
+ dcbt 0,r3
+ clrrdi r8,r3,3
+ insrdi r4,r4,8,48
+
+	/* Calculate the last acceptable address and check for possible
+	   addition overflow by using saturated math:
+	   r7 = r3 + r5
+	   r7 |= -(r7 < r3)  */
+ add r7,r3,r5
+ subfc r6,r3,r7
+ subfe r9,r9,r9
+ extsw r6,r9
+ or r7,r7,r6
+
+ insrdi r4,r4,16,32
+ cmpldi r5,32
+ li r9, -1
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ insrdi r4,r4,32,0
+ addi r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+ sld r9,r9,r6
+#else
+ srd r9,r9,r6
+#endif
+ ble L(small_range)
+
+ ld r12,0(r8) /* Load doubleword from memory. */
+ cmpb r3,r12,r4 /* Check for BYTEs in DWORD1. */
+ and r3,r3,r9
+ clrldi r5,r7,61 /* Byte count - 1 in last dword. */
+ clrrdi r7,r7,3 /* Address of last doubleword. */
+ cmpldi cr7,r3,0 /* Does r3 indicate we got a hit? */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r8
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+ bt 28,L(loop_setup)
+
+ /* Handle DWORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r3,r12,r4
+ cmpldi cr7,r3,0
+ bne cr7,L(done)
+
+L(loop_setup):
+ /* The last dword we want to read in the loop below is the one
+ containing the last byte of the string, ie. the dword at
+ (s + size - 1) & ~7, or r7. The first dword read is at
+ r8 + 8, we read 2 * cnt dwords, so the last dword read will
+ be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives
+ cnt = (r7 - r8) / 16 */
+ sub r6,r7,r8
+ srdi r6,r6,4 /* Number of loop iterations. */
+ mtctr r6 /* Setup the counter. */
+
+ /* Main loop to look for BYTE in the string. Since
+ it's a small loop (8 instructions), align it to 32-bytes. */
+ .align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the byte-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r3,r12,r4
+ cmpb r9,r11,r4
+ or r6,r9,r3 /* Merge everything in one doubleword. */
+ cmpldi cr7,r6,0
+ bne cr7,L(found)
+ bdnz L(loop)
+
+ /* We may have one more dword to read. */
+ cmpld r8,r7
+ beqlr
+
+ ldu r12,8(r8)
+ cmpb r3,r12,r4
+ cmpldi cr6,r3,0
+ bne cr6,L(done)
+ blr
+
+ .align 4
+L(found):
+ /* OK, one (or both) of the doublewords contains BYTE. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains BYTE. */
+ cmpldi cr6,r3,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* BYTE must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r3 so we can calculate the
+ pointer. */
+
+ mr r3,r9
+ addi r8,r8,8
+
+ /* r3 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as BYTE in the original
+ doubleword from the string. Use that to calculate the pointer.
+ We need to make sure BYTE is *before* the end of the range. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r3,-1
+ andc r0,r0,r3
+ popcntd r0,r0 /* Count trailing zeros. */
+#else
+ cntlzd r0,r3 /* Count leading zeros before the match. */
+#endif
+ cmpld r8,r7 /* Are we on the last dword? */
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r8,r0
+ cmpld cr7,r0,r5 /* If on the last dword, check byte offset. */
+ bnelr
+ blelr cr7
+ li r3,0
+ blr
+
+ .align 4
+L(null):
+ li r3,0
+ blr
+
+/* Deals with size <= 32. */
+ .align 4
+L(small_range):
+ cmpldi r5,0
+ beq L(null)
+ ld r12,0(r8) /* Load word from memory. */
+ cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
+ and r3,r3,r9
+ cmpldi cr7,r3,0
+ clrldi r5,r7,61 /* Byte count - 1 in last dword. */
+ clrrdi r7,r7,3 /* Address of last doubleword. */
+ cmpld r8,r7 /* Are we done already? */
+ bne cr7,L(done)
+ beqlr
+
+ ldu r12,8(r8)
+ cmpb r3,r12,r4
+ cmpldi cr6,r3,0
+ cmpld r8,r7
+ bne cr6,L(done) /* Found something. */
+ beqlr /* Hit end of string (length). */
+
+ ldu r12,8(r8)
+ cmpb r3,r12,r4
+ cmpldi cr6,r3,0
+ cmpld r8,r7
+ bne cr6,L(done)
+ beqlr
+
+ ldu r12,8(r8)
+ cmpb r3,r12,r4
+ cmpldi cr6,r3,0
+ cmpld r8,r7
+ bne cr6,L(done)
+ beqlr
+
+ ldu r12,8(r8)
+ cmpb r3,r12,r4
+ cmpldi cr6,r3,0
+ bne cr6,L(done)
+ blr
+
+END (MEMCHR)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
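Two tricks in this routine are worth restating in portable C (a sketch only;
cmpb has no direct C equivalent, so the classic SWAR zero-detect stands in
for it, and the function names are illustrative):

#include <stdint.h>
#include <stddef.h>

/* Saturating end address: if s + n wraps past the top of the address
   space, clamp it to ~0, mirroring the add/subfc/subfe/or sequence.  */
static uintptr_t
saturated_end (uintptr_t s, size_t n)
{
  uintptr_t end = s + n;
  return end < s ? (uintptr_t) -1 : end;
}

/* Nonzero iff some byte of WORD equals C.  POWER7's cmpb instead
   yields 0xff in every matching byte lane, which the tail code turns
   into an index with cntlzd (BE) or the popcntd trick (LE).  */
static uint64_t
has_byte (uint64_t word, unsigned char c)
{
  uint64_t splat = 0x0101010101010101ULL * c;   /* Like the insrdi splat.  */
  uint64_t x = word ^ splat;                    /* Matching lanes become 0.  */
  return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
}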
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S
new file mode 100644
index 0000000000..96ce8cee25
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -0,0 +1,1061 @@
+/* Optimized memcmp implementation for POWER7/PowerPC64.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+ .machine power7
+EALIGN (MEMCMP, 4, 0)
+ CALL_MCOUNT 3
+
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r8 /* next word in s1 */
+#define rWORD4 r9 /* next word in s2 */
+#define rWORD5 r10 /* next word in s1 */
+#define rWORD6 r11 /* next word in s2 */
+
+#define rOFF8 r20 /* 8 bytes offset. */
+#define rOFF16 r21 /* 16 bytes offset. */
+#define rOFF24 r22 /* 24 bytes offset. */
+#define rOFF32	r23	/* 32 bytes offset.  */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rWORD7 r30 /* next word in s1 */
+#define rWORD8 r31 /* next word in s2 */
+
+#define rWORD8SAVE (-8)
+#define rWORD7SAVE (-16)
+#define rOFF8SAVE (-24)
+#define rOFF16SAVE (-32)
+#define rOFF24SAVE (-40)
+#define rOFF32SAVE (-48)
+#define rSHRSAVE (-56)
+#define rSHLSAVE (-64)
+#define rWORD8SHIFTSAVE (-72)
+#define rWORD2SHIFTSAVE (-80)
+#define rWORD4SHIFTSAVE (-88)
+#define rWORD6SHIFTSAVE (-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD ldbrx
+#else
+# define LD ldx
+#endif
+
+ xor r0, rSTR2, rSTR1
+ cmpldi cr6, rN, 0
+ cmpldi cr1, rN, 12
+ clrldi. r0, r0, 61
+ clrldi r12, rSTR1, 61
+ cmpldi cr5, r12, 0
+ beq- cr6, L(zeroLength)
+ dcbt 0, rSTR1
+ dcbt 0, rSTR2
+/* If less than 8 bytes or not aligned, use the unaligned
+ byte loop. */
+ blt cr1, L(bytealigned)
+ std rWORD8, rWORD8SAVE(r1)
+ std rWORD7, rWORD7SAVE(r1)
+ std rOFF8, rOFF8SAVE(r1)
+ std rOFF16, rOFF16SAVE(r1)
+ std rOFF24, rOFF24SAVE(r1)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ cfi_offset(rOFF8, rOFF8SAVE)
+ cfi_offset(rOFF16, rOFF16SAVE)
+ cfi_offset(rOFF24, rOFF24SAVE)
+ cfi_offset(rOFF32, rOFF32SAVE)
+
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+
+ bne L(unaligned)
+/* At this point we know both strings have the same alignment and the
+ compare length is at least 8 bytes. r12 contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+ of r12 to 0. If r12 == 0 then we are already double word
+ aligned and can perform the DW aligned loop.
+
+ Otherwise we know the two strings have the same alignment (but not
+ yet DW). So we force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DW aligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+ correct and the first DW (shifted) is in the expected register pair. */
+ .align 4
+L(samealignment):
+ clrrdi rSTR1, rSTR1, 3
+ clrrdi rSTR2, rSTR2, 3
+ beq cr5, L(DWaligned)
+ add rN, rN, r12
+ sldi rWORD6, r12, 3
+ srdi r0, rN, 5 /* Divide by 32 */
+ andi. r12, rN, 24 /* Get the DW remainder */
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ beq L(dPs4)
+ mtctr r0
+ bgt cr1, L(dPs3)
+ beq cr1, L(dPs2)
+
+/* Remainder is 8 */
+ .align 3
+L(dsP1):
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD2, rWORD6
+ cmpld cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ b L(dP1e)
+/* Remainder is 16 */
+ .align 4
+L(dPs2):
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD2, rWORD6
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ b L(dP2e)
+/* Remainder is 24 */
+ .align 4
+L(dPs3):
+ sld rWORD3, rWORD1, rWORD6
+ sld rWORD4, rWORD2, rWORD6
+ cmpld cr1, rWORD3, rWORD4
+ b L(dP3e)
+/* Count is a multiple of 32, remainder is 0 */
+ .align 4
+L(dPs4):
+ mtctr r0
+ sld rWORD1, rWORD1, rWORD6
+ sld rWORD2, rWORD2, rWORD6
+ cmpld cr7, rWORD1, rWORD2
+ b L(dP4e)
+
+/* At this point we know both strings are double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWaligned):
+ andi. r12, rN, 24 /* Get the DW remainder */
+ srdi r0, rN, 5 /* Divide by 32 */
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ beq L(dP4)
+ bgt cr1, L(dP3)
+ beq cr1, L(dP2)
+
+/* Remainder is 8 */
+ .align 4
+L(dP1):
+ mtctr r0
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+ (8-15 byte compare), we want to use only volatile registers. This
+ means we can avoid restoring non-volatile registers since we did not
+ change any on the early exit path. The key here is the non-early
+ exit path only cares about the condition code (cr5), not about which
+ register pair was used. */
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
+ cmpld cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+L(dP1e):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5x)
+ bne cr7, L(dLcr7x)
+
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ bne cr1, L(dLcr1)
+ cmpld cr5, rWORD7, rWORD8
+ bdnz L(dLoop)
+ bne cr6, L(dLcr6)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ .align 3
+L(dP1x):
+ sldi. r12, rN, 3
+ bne cr5, L(dLcr5x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Remainder is 16 */
+ .align 4
+L(dP2):
+ mtctr r0
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+L(dP2e):
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(dLcr6)
+ bne cr5, L(dLcr5)
+ b L(dLoop2)
+ .align 4
+L(dP2x):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ sldi. r12, rN, 3
+ bne cr6, L(dLcr6x)
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr1, L(dLcr1x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Remainder is 24 */
+ .align 4
+L(dP3):
+ mtctr r0
+ LD rWORD3, 0, rSTR1
+ LD rWORD4, 0, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+L(dP3e):
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP3x)
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ bne cr1, L(dLcr1)
+ bne cr6, L(dLcr6)
+ b L(dLoop1)
+/* Again we are on an early exit path (24-31 byte compare); we want to
+   use only volatile registers and avoid restoring non-volatile
+   registers.  */
+ .align 4
+L(dP3x):
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ sldi. r12, rN, 3
+ bne cr1, L(dLcr1x)
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ bne cr6, L(dLcr6x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne cr7, L(dLcr7x)
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Count is a multiple of 32, remainder is 0 */
+ .align 4
+L(dP4):
+ mtctr r0
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+L(dP4e):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+ bne cr1, L(dLcr1)
+ bdz- L(d24) /* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+ .align 4
+L(dLoop):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+L(dLoop1):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+L(dLoop2):
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+L(dLoop3):
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ bne cr1, L(dLcr1)
+ cmpld cr7, rWORD1, rWORD2
+ bdnz L(dLoop)
+
+L(dL4):
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+ cmpld cr5, rWORD7, rWORD8
+L(d44):
+ bne cr7, L(dLcr7)
+L(d34):
+ bne cr1, L(dLcr1)
+L(d24):
+ bne cr6, L(dLcr6)
+L(d14):
+ sldi. r12, rN, 3
+ bne cr5, L(dLcr5)
+L(d04):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ beq L(duzeroLength)
+/* At this point we have a remainder of 1 to 7 bytes to compare. Since
+ we are aligned it is safe to load the whole double word, and use
+ shift right double to eliminate bits beyond the compare length. */
+L(d00):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ srd rWORD1, rWORD1, rN
+ srd rWORD2, rWORD2, rN
+ cmpld cr7, rWORD1, rWORD2
+ bne cr7, L(dLcr7x)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+ .align 4
+L(dLcr7):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr7x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr1):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr1x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr1
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr6):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr6x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr5):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr5x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr5
+ li rRTN, -1
+ blr
+
+ .align 4
+L(bytealigned):
+ mtctr rN
+
+/* We need to prime this loop. This loop is swing modulo scheduled
+ to avoid pipe delays. The dependent instruction latencies (load to
+   compare to conditional branch) are 2 to 3 cycles.  In this loop each
+ dispatch group ends in a branch and takes 1 cycle. Effectively
+ the first iteration of the loop only serves to load operands and
+ branches based on compares are delayed until the next loop.
+
+ So we must precondition some registers and condition codes so that
+ we don't exit the loop early on the first iteration. */
+
+ lbz rWORD1, 0(rSTR1)
+ lbz rWORD2, 0(rSTR2)
+ bdz L(b11)
+ cmpld cr7, rWORD1, rWORD2
+ lbz rWORD3, 1(rSTR1)
+ lbz rWORD4, 1(rSTR2)
+ bdz L(b12)
+ cmpld cr1, rWORD3, rWORD4
+ lbzu rWORD5, 2(rSTR1)
+ lbzu rWORD6, 2(rSTR2)
+ bdz L(b13)
+ .align 4
+L(bLoop):
+ lbzu rWORD1, 1(rSTR1)
+ lbzu rWORD2, 1(rSTR2)
+ bne cr7, L(bLcr7)
+
+ cmpld cr6, rWORD5, rWORD6
+ bdz L(b3i)
+
+ lbzu rWORD3, 1(rSTR1)
+ lbzu rWORD4, 1(rSTR2)
+ bne cr1, L(bLcr1)
+
+ cmpld cr7, rWORD1, rWORD2
+ bdz L(b2i)
+
+ lbzu rWORD5, 1(rSTR1)
+ lbzu rWORD6, 1(rSTR2)
+ bne cr6, L(bLcr6)
+
+ cmpld cr1, rWORD3, rWORD4
+ bdnz L(bLoop)
+
+/* We speculatively load bytes before we have tested the previous
+   bytes.  But we must avoid overrunning the length (in the ctr) to
+   prevent these speculative loads from causing a segfault.  In that
+   case the loop will exit early (before all pending bytes are
+   tested), and we must complete the pending operations before
+   returning.  */
+L(b1i):
+ bne cr7, L(bLcr7)
+ bne cr1, L(bLcr1)
+ b L(bx56)
+ .align 4
+L(b2i):
+ bne cr6, L(bLcr6)
+ bne cr7, L(bLcr7)
+ b L(bx34)
+ .align 4
+L(b3i):
+ bne cr1, L(bLcr1)
+ bne cr6, L(bLcr6)
+ b L(bx12)
+ .align 4
+L(bLcr7):
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
+ blr
+L(bLcr1):
+ li rRTN, 1
+ bgtlr cr1
+ li rRTN, -1
+ blr
+L(bLcr6):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+L(b13):
+ bne cr7, L(bx12)
+ bne cr1, L(bx34)
+L(bx56):
+ sub rRTN, rWORD5, rWORD6
+ blr
+ nop
+L(b12):
+ bne cr7, L(bx12)
+L(bx34):
+ sub rRTN, rWORD3, rWORD4
+ blr
+L(b11):
+L(bx12):
+ sub rRTN, rWORD1, rWORD2
+ blr
+
+ .align 4
+L(zeroLength):
+ li rRTN, 0
+ blr
+
+ .align 4
+/* At this point we know the strings have different alignment and the
+ compare length is at least 8 bytes. r12 contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+   of r12 to 0.  If r12 == 0 then rSTR1 is double word
+   aligned and we can perform the DWunaligned loop.
+
+   Otherwise we know that rSTR1 is not yet DW aligned.
+ So we can force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DWaligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair.  */
+L(unaligned):
+ std rSHL, rSHLSAVE(r1)
+ cfi_offset(rSHL, rSHLSAVE)
+ clrldi rSHL, rSTR2, 61
+ beq cr6, L(duzeroLength)
+ std rSHR, rSHRSAVE(r1)
+ cfi_offset(rSHR, rSHRSAVE)
+ beq cr5, L(DWunaligned)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+ in the 1st rSTR1 DW. */
+ sub rWORD8_SHIFT, rSTR2, r12
+/* But do not attempt to address the DW before the DW that contains
+   the actual start of rSTR2.  */
+ clrrdi rSTR2, rSTR2, 3
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+/* Compute the left/right shift counts for the unaligned rSTR2,
+ compensating for the logical (DW aligned) start of rSTR1. */
+ clrldi rSHL, rWORD8_SHIFT, 61
+ clrrdi rSTR1, rSTR1, 3
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ sldi rSHL, rSHL, 3
+ cmpld cr5, rWORD8_SHIFT, rSTR2
+ add rN, rN, r12
+ sldi rWORD6, r12, 3
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+ subfic rSHR, rSHL, 64
+ srdi r0, rN, 5 /* Divide by 32 */
+ andi. r12, rN, 24 /* Get the DW remainder */
+/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+ this special case those bits may be discarded anyway. Also we
+ must avoid loading a DW where none of the bits are part of rSTR2 as
+ this may cross a page boundary and cause a page fault. */
+ li rWORD8, 0
+ blt cr5, L(dus0)
+ LD rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 8
+ sld rWORD8, rWORD8, rSHL
+
+L(dus0):
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ srd r12, rWORD2, rSHR
+ clrldi rN, rN, 61
+ beq L(duPs4)
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ bgt cr1, L(duPs3)
+ beq cr1, L(duPs2)
+
+/* Remainder is 8 */
+ .align 4
+L(dusP1):
+ sld rWORD8_SHIFT, rWORD2, rSHL
+ sld rWORD7, rWORD1, rWORD6
+ sld rWORD8, rWORD8, rWORD6
+ bge cr7, L(duP1e)
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+ cmpld cr5, rWORD7, rWORD8
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+/* Remainder is 16 */
+ .align 4
+L(duPs2):
+ sld rWORD6_SHIFT, rWORD2, rSHL
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD8, rWORD6
+ b L(duP2e)
+/* Remainder is 24 */
+ .align 4
+L(duPs3):
+ sld rWORD4_SHIFT, rWORD2, rSHL
+ sld rWORD3, rWORD1, rWORD6
+ sld rWORD4, rWORD8, rWORD6
+ b L(duP3e)
+/* Count is a multiple of 32, remainder is 0 */
+ .align 4
+L(duPs4):
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ sld rWORD1, rWORD1, rWORD6
+ sld rWORD2, rWORD8, rWORD6
+ b L(duP4e)
+
+/* At this point we know rSTR1 is double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWunaligned):
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ clrrdi rSTR2, rSTR2, 3
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ srdi r0, rN, 5 /* Divide by 32 */
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ andi. r12, rN, 24 /* Get the DW remainder */
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+ sldi rSHL, rSHL, 3
+ LD rWORD6, 0, rSTR2
+ LD rWORD8, rOFF8, rSTR2
+ addi rSTR2, rSTR2, 8
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ subfic rSHR, rSHL, 64
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ beq L(duP4)
+ mtctr r0
+ bgt cr1, L(duP3)
+ beq cr1, L(duP2)
+
+/* Remainder is 8 */
+ .align 4
+L(duP1):
+ srd r12, rWORD8, rSHR
+ LD rWORD7, 0, rSTR1
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP1x)
+L(duP1e):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ bne cr5, L(duLcr5)
+ or rWORD4, r12, rWORD2_SHIFT
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ bne cr7, L(duLcr7)
+ or rWORD6, r0, rWORD4_SHIFT
+ cmpld cr6, rWORD5, rWORD6
+ b L(duLoop3)
+ .align 4
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+L(duP1x):
+ cmpld cr5, rWORD7, rWORD8
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+/* Remainder is 16 */
+ .align 4
+L(duP2):
+ srd r0, rWORD8, rSHR
+ LD rWORD5, 0, rSTR1
+ or rWORD6, r0, rWORD6_SHIFT
+ sld rWORD6_SHIFT, rWORD8, rSHL
+L(duP2e):
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP2x)
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ bne cr5, L(duLcr5)
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ cmpld cr1, rWORD3, rWORD4
+ b L(duLoop2)
+ .align 4
+L(duP2x):
+ cmpld cr5, rWORD7, rWORD8
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(duLcr6)
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+
+/* Remainder is 24 */
+ .align 4
+L(duP3):
+ srd r12, rWORD8, rSHR
+ LD rWORD3, 0, rSTR1
+ sld rWORD4_SHIFT, rWORD8, rSHL
+ or rWORD4, r12, rWORD6_SHIFT
+L(duP3e):
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP3x)
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ cmpld cr7, rWORD1, rWORD2
+ b L(duLoop1)
+ .align 4
+L(duP3x):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+
+/* Count is a multiple of 32, remainder is 0 */
+ .align 4
+L(duP4):
+ mtctr r0
+ srd r0, rWORD8, rSHR
+ LD rWORD1, 0, rSTR1
+ sld rWORD2_SHIFT, rWORD8, rSHL
+ or rWORD2, r0, rWORD6_SHIFT
+L(duP4e):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr7, L(duLcr7)
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
+ cmpld cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ cmpld cr5, rWORD7, rWORD8
+ bdz L(du24) /* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+ .align 4
+L(duLoop):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+L(duLoop1):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+L(duLoop2):
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(duLcr7)
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+L(duLoop3):
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ cmpld cr7, rWORD1, rWORD2
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ bdnz L(duLoop)
+
+L(duL4):
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ cmpld cr5, rWORD7, rWORD8
+L(du44):
+ bne cr7, L(duLcr7)
+L(du34):
+ bne cr1, L(duLcr1)
+L(du24):
+ bne cr6, L(duLcr6)
+L(du14):
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+/* At this point we have a remainder of 1 to 7 bytes to compare. We use
+ shift right double to eliminate bits beyond the compare length.
+
+ However it may not be safe to load rWORD2 which may be beyond the
+ string length. So we compare the bit length of the remainder to
+ the right shift count (rSHR). If the bit count is less than or equal
+ we do not need to load rWORD2 (all significant bits are already in
+ rWORD8_SHIFT). */
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ .align 4
+L(dutrim):
+ LD rWORD1, rOFF8, rSTR1
+ ld rWORD8, -8(r1)
+ subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
+ or rWORD2, r0, rWORD8_SHIFT
+ ld rWORD7, rWORD7SAVE(r1)
+ ld rSHL, rSHLSAVE(r1)
+ srd rWORD1, rWORD1, rN
+ srd rWORD2, rWORD2, rN
+ ld rSHR, rSHRSAVE(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ li rRTN, 0
+ cmpld cr7, rWORD1, rWORD2
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ beq cr7, L(dureturn24)
+ li rRTN, 1
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ bgtlr cr7
+ li rRTN, -1
+ blr
+ .align 4
+L(duLcr7):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr7, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr1):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr1, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr6):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr6, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr5):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr5, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+
+ .align 3
+L(duZeroReturn):
+ li rRTN, 0
+ .align 4
+L(dureturn):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dureturn29):
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+L(dureturn27):
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+L(dureturn24):
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ blr
+
+L(duzeroLength):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+END (MEMCMP)
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp, bcmp)
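The invariant the whole routine relies on: doublewords loaded in big-endian
byte order (ldx on BE, ldbrx on LE, the LD macro above) compare as unsigned
integers in the same order byte-wise memcmp requires. A C sketch of the
compare and tail-trim steps (names illustrative):

#include <stdint.h>

/* Both words must hold their 8 bytes in big-endian order.  */
static int
cmp_doubleword (uint64_t w1, uint64_t w2)
{
  if (w1 == w2)
    return 0;
  return w1 > w2 ? 1 : -1;           /* The li rRTN,1 / li rRTN,-1 tails.  */
}

/* For a 1-7 byte remainder, shift out the bytes beyond the length
   before comparing, as the subfic/srd pair in L(d00) does.  */
static int
cmp_tail (uint64_t w1, uint64_t w2, unsigned rem_bytes)
{
  unsigned sh = 64 - 8 * rem_bytes;  /* "Shift count is 64 - (rN * 8)".  */
  return cmp_doubleword (w1 >> sh, w2 >> sh);
}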
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S
new file mode 100644
index 0000000000..e08993cbc3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -0,0 +1,430 @@
+/* Optimized memcpy implementation for PowerPC64/POWER7.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst'. */
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#define dst 11 /* Use r11 so r3 kept unchanged. */
+#define src 4
+#define cnt 5
+
+ .machine power7
+EALIGN (MEMCPY, 5, 0)
+ CALL_MCOUNT 3
+
+ cmpldi cr1,cnt,31
+ neg 0,3
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+/* Align copies to quadwords when using VSX instructions, in order to
+   avoid alignment traps when memcpy is used on non-cacheable memory
+   (for instance, memory-mapped I/O).  */
+ andi. 10,3,15
+ clrldi 11,4,60
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+
+ mr dst,3
+ bne cr6,L(copy_GE_32_unaligned)
+ beq L(aligned_copy)
+
+ mtocrf 0x01,0
+ clrldi 0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes. */
+1:
+ bf 31,2f
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+2:
+ bf 30,4f
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
+4:
+ bf 29,8f
+ lwz 6,0(src)
+ addi src,src,4
+ stw 6,0(dst)
+ addi dst,dst,4
+8:
+ bf 28,16f
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
+16:
+ subf cnt,0,cnt
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+ li 6,16
+ li 7,32
+ li 8,48
+ mtocrf 0x02,cnt
+ srdi 12,cnt,7
+ cmpdi 12,0
+ beq L(aligned_tail)
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+ mtctr 12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128head):
+ /* for the 2nd + iteration of this loop. */
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+L(aligned_128loop):
+ lxvd2x 8,src,7
+ lxvd2x 9,src,8
+ stxvd2x 6,0,dst
+ addi src,src,64
+ stxvd2x 7,dst,6
+ stxvd2x 8,dst,7
+ stxvd2x 9,dst,8
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+ addi dst,dst,64
+ lxvd2x 8,src,7
+ lxvd2x 9,src,8
+ addi src,src,64
+ stxvd2x 6,0,dst
+ stxvd2x 7,dst,6
+ stxvd2x 8,dst,7
+ stxvd2x 9,dst,8
+ addi dst,dst,64
+ bdnz L(aligned_128head)
+
+L(aligned_tail):
+ mtocrf 0x01,cnt
+ bf 25,32f
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+ lxvd2x 8,src,7
+ lxvd2x 9,src,8
+ addi src,src,64
+ stxvd2x 6,0,dst
+ stxvd2x 7,dst,6
+ stxvd2x 8,dst,7
+ stxvd2x 9,dst,8
+ addi dst,dst,64
+32:
+ bf 26,16f
+ lxvd2x 6,0,src
+ lxvd2x 7,src,6
+ addi src,src,32
+ stxvd2x 6,0,dst
+ stxvd2x 7,dst,6
+ addi dst,dst,32
+16:
+ bf 27,8f
+ lxvd2x 6,0,src
+ addi src,src,16
+ stxvd2x 6,0,dst
+ addi dst,dst,16
+8:
+ bf 28,4f
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
+ blr
+
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ mr dst,3
+ cmpldi cr6,cnt,8
+ mtocrf 0x01,cnt
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ andi. 0,8,3
+ cmpldi cr1,cnt,16
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf cnt,0,cnt
+2:
+ bf 30,1f
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
+1:
+ bf 31,L(end_4bytes_alignment)
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,cnt,16
+ mtocrf 0x01,cnt
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(src)
+ lwz 7,4(src)
+ stw 6,0(dst)
+ lwz 8,8(src)
+ stw 7,4(dst)
+ lwz 6,12(src)
+ addi src,src,16
+ stw 8,8(dst)
+ stw 6,12(dst)
+ addi dst,dst,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ lwz 6,0(src)
+ lwz 7,4(src)
+ addi src,src,8
+ stw 6,0(dst)
+ stw 7,4(dst)
+ addi dst,dst,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz 6,0(src)
+ sth 6,0(dst)
+ bflr 31
+ lbz 7,2(src)
+ stb 7,2(dst)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ lbz 6,4(src)
+ stb 6,4(dst)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,0(src)
+ stb 6,0(dst)
+ /* Return original DST pointer. */
+ blr
+
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(src)
+ lwz 7,4(src)
+ stw 6,0(dst)
+ stw 7,4(dst)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
+ srdi 9,cnt,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+ /* DST is not quadword aligned, get it aligned. */
+
+ mtocrf 0x01,0
+ subf cnt,0,cnt
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz 6,0(src)
+ addi src,src,1
+ stb 6,0(dst)
+ addi dst,dst,1
+2:
+ bf 30,4f
+ lhz 6,0(src)
+ addi src,src,2
+ sth 6,0(dst)
+ addi dst,dst,2
+4:
+ bf 29,8f
+ lwz 6,0(src)
+ addi src,src,4
+ stw 6,0(dst)
+ addi dst,dst,4
+8:
+ bf 28,0f
+ ld 6,0(src)
+ addi src,src,8
+ std 6,0(dst)
+ addi dst,dst,8
+0:
+ srdi 9,cnt,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present, it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi 10,cnt,60
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,10,0
+ srdi 8,cnt,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,src
+#else
+ lvsl 5,0,src
+#endif
+ lvx 3,0,src
+ li 0,0
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx 4,src,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ addi src,src,16
+ stvx 6,0,dst
+ addi dst,dst,16
+ vor 3,4,4
+ clrrdi 0,src,60
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx 4,src,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ lvx 3,src,7
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
+ addi src,src,32
+ stvx 6,0,dst
+ stvx 10,dst,6
+ addi dst,dst,32
+ bdnz L(unaligned_loop)
+
+ clrrdi 0,src,60
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,cnt
+ beqlr cr1
+
+ add src,src,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz 6,0(src)
+ lwz 7,4(src)
+ addi src,src,8
+ stw 6,0(dst)
+ stw 7,4(dst)
+ addi dst,dst,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(src)
+ stw 6,0(dst)
+ bf 30,L(tail5)
+ lhz 7,4(src)
+ sth 7,4(dst)
+ bflr 31
+ lbz 8,6(src)
+ stb 8,6(dst)
+ /* Return original DST pointer. */
+ blr
+
+END_GEN_TB (MEMCPY,TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S
new file mode 100644
index 0000000000..4c0f7c3571
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -0,0 +1,835 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+   This implementation checks whether the memory at 'dest' overlaps with
+   'src'.  If it does not, it performs an optimized memcpy (similar to the
+   POWER7 memcpy, embedded here to gain some cycles).
+   If source and destination overlap, an optimized backward memcpy is used
+   instead.  */
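
A hedged C sketch of that dispatch (copy_fwd and copy_bwd are hypothetical
stand-ins for the forward and backward paths implemented below):

    #include <stddef.h>
    #include <stdint.h>

    static void *copy_fwd (void *dest, const void *src, size_t len);
    static void *copy_bwd (void *dest, const void *src, size_t len);

    static void *my_memmove (void *dest, const void *src, size_t len)
    {
      /* (dest - src) < len, as an unsigned compare, is true exactly when
         dest lies inside [src, src + len), the one case where a forward
         copy would clobber not-yet-read source bytes.  */
      if ((uintptr_t) dest - (uintptr_t) src < len)
        return copy_bwd (dest, src, len);
      return copy_fwd (dest, src, len);   /* memcpy-style forward path */
    }
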
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+ .machine power7
+EALIGN (MEMMOVE, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memmove):
+ subf r9,r4,r3
+ cmpld cr7,r9,r5
+ blt cr7,L(memmove_bwd)
+
+ cmpldi cr1,r5,31
+ neg 0,3
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+ andi. 10,3,15
+ clrldi 11,4,60
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+
+ mr r11,3
+ bne cr6,L(copy_GE_32_unaligned)
+ beq L(aligned_copy)
+
+ mtocrf 0x01,0
+ clrldi 0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+ bf 31,2f
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+2:
+ bf 30,4f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+4:
+ bf 29,8f
+ lwz 6,0(r4)
+ addi r4,r4,4
+ stw 6,0(r11)
+ addi r11,r11,4
+8:
+ bf 28,16f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+16:
+ subf r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+ li 6,16
+ li 7,32
+ li 8,48
+ mtocrf 0x02,r5
+ srdi 12,r5,7
+ cmpdi 12,0
+ beq L(aligned_tail)
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ mtctr 12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128head):
+	/* For the 2nd and later iterations of this loop.  */
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+L(aligned_128loop):
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ stxvd2x 6,0,r11
+ addi r4,r4,64
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ addi r11,r11,64
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ addi r4,r4,64
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ addi r11,r11,64
+ bdnz L(aligned_128head)
+
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ addi r4,r4,64
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ addi r11,r11,64
+32:
+ bf 26,16f
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ addi r4,r4,32
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ addi r11,r11,32
+16:
+ bf 27,8f
+ lxvd2x 6,0,r4
+ addi r4,r4,16
+ stxvd2x 6,0,r11
+ addi r11,r11,16
+8:
+ bf 28,4f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ mr r11,3
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ andi. 0,8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf r5,0,r5
+2:
+ bf 30,1f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+1:
+ bf 31,L(end_4bytes_alignment)
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ stw 6,0(r11)
+ lwz 8,8(r4)
+ stw 7,4(r11)
+ lwz 6,12(r4)
+ addi r4,r4,16
+ stw 8,8(r11)
+ stw 6,12(r11)
+ addi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ addi r4,r4,8
+ stw 6,0(r11)
+ stw 7,4(r11)
+ addi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz 6,0(r4)
+ sth 6,0(r11)
+ bflr 31
+ lbz 7,2(r4)
+ stb 7,2(r11)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ lbz 6,4(r4)
+ stb 6,4(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,0(r4)
+ stb 6,0(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ stw 6,0(r11)
+ stw 7,4(r11)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ clrldi 0,0,60 /* Number of bytes until the 1st r11 quadword. */
+ srdi 9,r5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+ /* DST is not quadword aligned, get it aligned. */
+
+ mtocrf 0x01,0
+ subf r5,0,r5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+2:
+ bf 30,4f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+4:
+ bf 29,8f
+ lwz 6,0(r4)
+ addi r4,r4,4
+ stw 6,0(r11)
+ addi r11,r11,4
+8:
+ bf 28,0f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+0:
+ srdi 9,r5,4 /* Number of full quadwords remaining. */
+
+	/* The proper alignment is present; it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi 10,r5,60
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,10,0
+ srdi 8,r5,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,r4
+#else
+ lvsl 5,0,r4
+#endif
+ lvx 3,0,r4
+ li 0,0
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx 4,r4,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ addi r4,r4,16
+ stvx 6,0,r11
+ addi r11,r11,16
+ vor 3,4,4
+ clrrdi 0,r4,60
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx 4,r4,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ lvx 3,r4,7
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
+ addi r4,r4,32
+ stvx 6,0,r11
+ stvx 10,r11,6
+ addi r11,r11,32
+ bdnz L(unaligned_loop)
+
+ clrrdi 0,r4,60
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,r5
+ beqlr cr1
+
+ add r4,r4,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ addi r4,r4,8
+ stw 6,0(r11)
+ stw 7,4(r11)
+ addi r11,r11,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+	/* Start of the backward memcpy implementation: the algorithm first
+	   checks whether src and dest share the same alignment.  If they do,
+	   it aligns both to 16 bytes and copies using VSX instructions.
+	   If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
+	   instructions to read two 16-byte chunks at a time, shifting and
+	   permuting the bytes read so that the stores to dest stay aligned.  */
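
For reference, the byte-granular C equivalent of this backward path (the
assembly below does the same thing from the top end down, 16 and 128 bytes
at a time):

    #include <stddef.h>

    static void copy_backward_bytes (unsigned char *d,
                                     const unsigned char *s, size_t n)
    {
      d += n;                     /* start just past the last byte */
      s += n;
      while (n-- > 0)
        *--d = *--s;              /* walk down toward the first byte */
    }
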
+L(memmove_bwd):
+ cmpldi cr1,r5,31
+ /* Copy is done backwards: update the pointers and check alignment. */
+ add r11,r3,r5
+ add r4,r4,r5
+ mr r0,r11
+ ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move
+ code. */
+
+ andi. r10,r11,15 /* Check if r11 is aligned to 16 bytes */
+ clrldi r9,r4,60 /* Check if r4 is aligned to 16 bytes */
+ cmpld cr6,r10,r9 /* SRC and DST alignments match? */
+
+ bne cr6,L(copy_GE_32_unaligned_bwd)
+ beq L(aligned_copy_bwd)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes. */
+1:
+ bf 31,2f
+ lbz r6,-1(r4)
+ subi r4,r4,1
+ stb r6,-1(r11)
+ subi r11,r11,1
+2:
+ bf 30,4f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+4:
+ bf 29,8f
+ lwz r6,-4(r4)
+ subi r4,r4,4
+ stw r6,-4(r11)
+ subi r11,r11,4
+8:
+ bf 28,16f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+16:
+ subf r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+ li r6,-16
+ li r7,-32
+ li r8,-48
+ li r9,-64
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail_bwd)
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ mtctr 12
+ b L(aligned_128loop_bwd)
+
+ .align 4
+L(aligned_128head_bwd):
+	/* For the 2nd and later iterations of this loop.  */
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+L(aligned_128loop_bwd):
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ stxvd2x v6,r11,r6
+ subi r4,r4,64
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,7
+ subi r11,r11,64
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ subi r4,r4,64
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ subi r11,r11,64
+ bdnz L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+ mtocrf 0x01,r5
+ bf 25,32f
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ subi r4,r4,64
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ subi r11,r11,64
+32:
+ bf 26,16f
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ subi r4,r4,32
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ subi r11,r11,32
+16:
+ bf 27,8f
+ lxvd2x v6,r4,r6
+ subi r4,r4,16
+ stxvd2x v6,r11,r6
+ subi r11,r11,16
+8:
+ bf 28,4f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2_bwd)
+ lwz r6,-4(r4)
+ stw r6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz r7,-6(r4)
+ sth r7,-6(r11)
+ bflr 31
+ lbz r8,-7(r4)
+ stb r8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32_bwd):
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(copy_LE_8_bwd)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned_bwd)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf r5,0,r5
+2:
+ bf 30,1f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+1:
+ bf 31,L(end_4bytes_alignment_bwd)
+ lbz 6,-1(r4)
+ subi r4,r4,1
+ stb 6,-1(r11)
+ subi r11,r11,1
+
+ .align 4
+L(end_4bytes_alignment_bwd):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ stw r6,-4(r11)
+ lwz r8,-12(r4)
+ stw r7,-8(r11)
+ lwz r6,-16(r4)
+ subi r4,r4,16
+ stw r8,-12(r11)
+ stw r6,-16(r11)
+ subi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4_bwd)
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ subi r4,r4,8
+ stw r6,-4(r11)
+ stw r7,-8(r11)
+ subi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4_bwd):
+ bf 29,L(tail2_bwd)
+ lwz 6,-4(r4)
+ stw 6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz 7,-6(r4)
+ sth 7,-6(r11)
+ bflr 31
+ lbz 8,-7(r4)
+ stb 8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2_bwd):
+ bf 30,1f
+ lhz 6,-2(r4)
+ sth 6,-2(r11)
+ bflr 31
+ lbz 7,-3(r4)
+ stb 7,-3(r11)
+ blr
+
+ .align 4
+L(tail5_bwd):
+ bflr 31
+ lbz 6,-5(r4)
+ stb 6,-5(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,-1(r4)
+ stb 6,-1(r11)
+ /* Return original DST pointer. */
+ blr
+
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8_bwd):
+ bne cr6,L(tail4_bwd)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+ lwz 6,-8(r4)
+ lwz 7,-4(r4)
+ stw 6,-8(r11)
+ stw 7,-4(r11)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned_bwd):
+	andi.	r10,r11,15	/* Check alignment of DST against 16 bytes.  */
+ srdi r9,r5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont_bwd)
+
+	/* DST is not quadword aligned; r10 holds its low-order 4 bits,
+	   the number of bytes that must be copied to align it.  */
+ mtocrf 0x01,r10
+ subf r5,r10,r5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz r6,-1(r4)
+ subi r4,r4,1
+ stb r6,-1(r11)
+ subi r11,r11,1
+2:
+ bf 30,4f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+4:
+ bf 29,8f
+ lwz r6,-4(r4)
+ subi r4,r4,4
+ stw r6,-4(r11)
+ subi r11,r11,4
+8:
+ bf 28,0f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+0:
+ srdi r9,r5,4 /* Number of full quadwords remaining. */
+
+	/* The proper alignment is present; it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont_bwd):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi r10,r5,60
+ li r6,-16 /* Index for 16-bytes offsets. */
+ li r7,-32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,10,0
+ srdi r8,r5,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr v5,r0,r4
+#else
+ lvsl v5,r0,r4
+#endif
+ lvx v3,0,r4
+ li r0,0
+ bf 31,L(setup_unaligned_loop_bwd)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+ vperm v6,v3,v4,v5
+#else
+ vperm v6,v4,v3,v5
+#endif
+ subi r4,r4,16
+ stvx v6,r11,r6
+ subi r11,r11,16
+ vor v3,v4,v4
+ clrrdi r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+ mtctr r8
+ ble cr6,L(end_unaligned_loop_bwd)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop_bwd):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+ vperm v6,v3,v4,v5
+#else
+ vperm v6,v4,v3,v5
+#endif
+ lvx v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+ vperm v10,v4,v3,v5
+#else
+ vperm v10,v3,v4,v5
+#endif
+ subi r4,r4,32
+ stvx v6,r11,r6
+ stvx v10,r11,r7
+ subi r11,r11,32
+ bdnz L(unaligned_loop_bwd)
+
+ clrrdi r0,r4,60
+
+ .align 4
+L(end_unaligned_loop_bwd):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,r5
+ beqlr cr1
+
+ add r4,r4,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ subi r4,r4,8
+ stw r6,-4(r11)
+ stw r7,-8(r11)
+ subi r11,r11,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2_bwd)
+ lwz r6,-4(r4)
+ stw r6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz r7,-6(r4)
+ sth r7,-6(r11)
+ bflr 31
+ lbz r8,-7(r4)
+ stb r8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+END_GEN_TB (MEMMOVE, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
+   Implemented in this file to keep the linker from creating a stub for
+   the branch to '_memmove'.  */
+ENTRY (__bcopy)
+ mr r6,r3
+ mr r3,r4
+ mr r4,r6
+ b L(_memmove)
+END (__bcopy)
+weak_alias (__bcopy, bcopy)
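
The three mr instructions in __bcopy just swap the argument registers,
since the legacy BSD bcopy takes (src, dest, n) while memmove takes
(dest, src, n); the two calls in this sketch are equivalent:

    #include <string.h>
    #include <strings.h>

    static void demo_bcopy (void *dest, const void *src, size_t n)
    {
      bcopy (src, dest, n);       /* legacy BSD argument order */
      memmove (dest, src, n);     /* equivalent modern call */
    }
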
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S
new file mode 100644
index 0000000000..4e15d1e40c
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S
@@ -0,0 +1,472 @@
+/* Optimized mempcpy implementation for POWER7.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst' + 'len'. */
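
The dst + len return value makes mempcpy convenient for chained appends;
a small usage sketch (mempcpy is a GNU extension, so _GNU_SOURCE is needed
for the prototype):

    #define _GNU_SOURCE
    #include <string.h>

    static void build_abcd (void)
    {
      char buf[16];
      char *p = buf;
      p = mempcpy (p, "ab", 2);   /* returns buf + 2 */
      p = mempcpy (p, "cd", 2);   /* append continues where the last copy ended */
      *p = '\0';                  /* buf now holds "abcd" */
    }
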
+
+#ifndef MEMPCPY
+# define MEMPCPY __mempcpy
+#endif
+ .machine power7
+EALIGN (MEMPCPY, 5, 0)
+ CALL_MCOUNT 3
+
+ cmpldi cr1,5,31
+ neg 0,3
+ std 3,-16(1)
+ std 31,-8(1)
+ cfi_offset(31,-8)
+ ble cr1,L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+ andi. 11,3,7 /* Check alignment of DST. */
+
+
+ clrldi 10,4,61 /* Check alignment of SRC. */
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+ mr 12,4
+ mr 31,5
+ bne cr6,L(copy_GE_32_unaligned)
+
+	srdi    9,5,3	/* Number of full doublewords remaining.  */
+
+ beq L(copy_GE_32_aligned_cont)
+
+ clrldi 0,0,61
+ mtcrf 0x01,0
+ subf 31,0,5
+
+ /* Get the SRC aligned to 8 bytes. */
+
+1: bf 31,2f
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+2: bf 30,4f
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+4: bf 29,0f
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+0:
+ clrldi 10,12,61 /* Check alignment of SRC again. */
+ srdi 9,31,3 /* Number of full doublewords remaining. */
+
+L(copy_GE_32_aligned_cont):
+
+ clrldi 11,31,61
+ mtcrf 0x01,9
+
+ srdi 8,31,5
+ cmpldi cr1,9,4
+ cmpldi cr6,11,0
+ mr 11,12
+
+ /* Copy 1~3 doublewords so the main loop starts
+ at a multiple of 32 bytes. */
+
+ bf 30,1f
+ ld 6,0(12)
+ ld 7,8(12)
+ addi 11,12,16
+ mtctr 8
+ std 6,0(3)
+ std 7,8(3)
+ addi 10,3,16
+ bf 31,4f
+ ld 0,16(12)
+ std 0,16(3)
+ blt cr1,3f
+ addi 11,12,24
+ addi 10,3,24
+ b 4f
+
+ .align 4
+1: /* Copy 1 doubleword and set the counter. */
+ mr 10,3
+ mtctr 8
+ bf 31,4f
+ ld 6,0(12)
+ addi 11,12,8
+ std 6,0(3)
+ addi 10,3,8
+
+ /* Main aligned copy loop. Copies 32-bytes at a time. */
+ .align 4
+4:
+ ld 6,0(11)
+ ld 7,8(11)
+ ld 8,16(11)
+ ld 0,24(11)
+ addi 11,11,32
+
+ std 6,0(10)
+ std 7,8(10)
+ std 8,16(10)
+ std 0,24(10)
+ addi 10,10,32
+ bdnz 4b
+3:
+
+ /* Check for tail bytes. */
+ rldicr 0,31,0,60
+ mtcrf 0x01,31
+ beq cr6,0f
+
+.L9:
+ add 3,3,0
+ add 12,12,0
+
+ /* At this point we have a tail of 0-7 bytes and we know that the
+ destination is doubleword-aligned. */
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return DST + LEN pointer. */
+ ld 31,-8(1)
+ ld 3,-16(1)
+ add 3,3,5
+ blr
+
+ /* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ cmpldi cr6,5,8
+ mr 12,4
+ mtcrf 0x01,5
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ clrrdi 11,4,2
+ andi. 0,8,3
+ cmpldi cr1,5,16
+ mr 10,5
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-bytes alignment for SRC. */
+ mtocrf 0x01,0
+ subf 10,0,5
+2: bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: bf 31,L(end_4bytes_alignment)
+
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,10,16
+ mtcrf 0x01,10
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(12)
+ lwz 7,4(12)
+ stw 6,0(3)
+ lwz 8,8(12)
+ stw 7,4(3)
+ lwz 6,12(12)
+ addi 12,12,16
+ stw 8,8(3)
+ stw 6,12(3)
+ addi 3,3,16
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ lwz 6,0(12)
+ lwz 7,4(12)
+ addi 12,12,8
+ stw 6,0(3)
+ stw 7,4(3)
+ addi 3,3,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2-3 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ sth 6,0(3)
+ bf 31,0f
+ lbz 7,2(12)
+ stb 7,2(3)
+ ld 3,-16(1)
+ add 3,3,5
+ blr
+
+ .align 4
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return DST + LEN pointer. */
+ ld 3,-16(1)
+ add 3,3,5
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,4f
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(4)
+ lwz 7,4(4)
+ stw 6,0(3)
+ stw 7,4(3)
+ ld 3,-16(1) /* Return DST + LEN pointer. */
+ add 3,3,5
+ blr
+
+ .align 4
+4: /* Copies 4~7 bytes. */
+ bf 29,2b
+
+ lwz 6,0(4)
+ stw 6,0(3)
+ bf 30,5f
+ lhz 7,4(4)
+ sth 7,4(3)
+ bf 31,0f
+ lbz 8,6(4)
+ stb 8,6(3)
+ ld 3,-16(1)
+ add 3,3,5
+ blr
+
+ .align 4
+5: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,4(4)
+ stb 6,4(3)
+
+0: /* Return DST + LEN pointer. */
+ ld 3,-16(1)
+ add 3,3,5
+ blr
+
+ /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ clrldi 0,0,60 /* Number of bytes until the 1st
+ quadword. */
+ andi. 11,3,15 /* Check alignment of DST (against
+ quadwords). */
+ srdi 9,5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+ mtcrf 0x01,0
+ subf 31,0,5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1: /* Copy 1 byte. */
+ bf 31,2f
+
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+2: /* Copy 2 bytes. */
+ bf 30,4f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+4: /* Copy 4 bytes. */
+ bf 29,8f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+8: /* Copy 8 bytes. */
+ bf 28,0f
+
+ ld 6,0(12)
+ addi 12,12,8
+ std 6,0(3)
+ addi 3,3,8
+0:
+ clrldi 10,12,60 /* Check alignment of SRC. */
+ srdi 9,31,4 /* Number of full quadwords remaining. */
+
+	/* The proper alignment is present; it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi 11,31,60
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,11,0
+ srdi 8,31,5 /* Setup the loop counter. */
+ mr 10,3
+ mr 11,12
+ mtcrf 0x01,9
+ cmpldi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
+ lvsl 5,0,12
+#endif
+ lvx 3,0,12
+ bf 31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+ lvx 4,12,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ addi 11,12,16
+ addi 10,3,16
+ stvx 6,0,3
+ vor 3,4,4
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx 4,11,6 /* vr4 = r11+16. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ lvx 3,11,7 /* vr3 = r11+32. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
+ addi 11,11,32
+ stvx 6,0,10
+ stvx 10,10,6
+ addi 10,10,32
+
+ bdnz L(unaligned_loop)
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ rldicr 0,31,0,59
+ mtcrf 0x01,31
+ beq cr1,0f
+
+ add 3,3,0
+ add 12,12,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ lwz 6,0(12)
+ lwz 7,4(12)
+ addi 12,12,8
+ stw 6,0(3)
+ stw 7,4(3)
+ addi 3,3,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2~3 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return DST + LEN pointer. */
+ ld 31,-8(1)
+ ld 3,-16(1)
+ add 3,3,5
+ blr
+
+END_GEN_TB (MEMPCPY,TB_TOCLESS)
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S
new file mode 100644
index 0000000000..4276768915
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S
@@ -0,0 +1,201 @@
+/* Optimized memrchr implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void * [r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5])  */
+
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr
+#endif
+ .machine power7
+ENTRY (MEMRCHR)
+ CALL_MCOUNT 3
+ add r7,r3,r5 /* Calculate the last acceptable address. */
+ neg r0,r7
+ addi r7,r7,-1
+ mr r10,r3
+ clrrdi r6,r7,7
+ li r9,3<<5
+ dcbt r9,r6,8 /* Stream hint, decreasing addresses. */
+
+ /* Replicate BYTE to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
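
The three insrdi steps splat BYTE into every lane of a doubleword; in C the
same value is commonly built with a single multiply (an equivalent
expression, not what the hardware literally executes):

    #include <stdint.h>

    static inline uint64_t repeat_byte (unsigned char c)
    {
      return (uint64_t) c * 0x0101010101010101ULL;  /* c in all 8 byte lanes */
    }
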
+ li r6,-8
+ li r9,-1
+ rlwinm r0,r0,3,26,28 /* Calculate padding. */
+ clrrdi r8,r7,3
+ srd r9,r9,r0
+ cmpldi r5,32
+ clrrdi r0,r10,3
+ ble L(small_range)
+
+#ifdef __LITTLE_ENDIAN__
+ ldx r12,0,r8
+#else
+ ldbrx r12,0,r8 /* Load reversed doubleword from memory. */
+#endif
+ cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
+ and r3,r3,r9
+ cmpldi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r8
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+ bf 28,L(loop_setup)
+
+ /* Handle DWORD2 of pair. */
+#ifdef __LITTLE_ENDIAN__
+ ldx r12,r8,r6
+#else
+ ldbrx r12,r8,r6
+#endif
+ addi r8,r8,-8
+ cmpb r3,r12,r4
+ cmpldi cr7,r3,0
+ bne cr7,L(done)
+
+L(loop_setup):
+ /* The last dword we want to read in the loop below is the one
+ containing the first byte of the string, ie. the dword at
+ s & ~7, or r0. The first dword read is at r8 - 8, we
+ read 2 * cnt dwords, so the last dword read will be at
+ r8 - 8 - 16 * cnt + 8. Solving for cnt gives
+ cnt = (r8 - r0) / 16 */
+ sub r5,r8,r0
+ addi r8,r8,-8
+ srdi r9,r5,4 /* Number of loop iterations. */
+ mtctr r9 /* Setup the counter. */
+
+ /* Main loop to look for BYTE backwards in the string.
+ FIXME: Investigate whether 32 byte align helps with this
+ 9 instruction loop. */
+ .align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the byte-checking process for bigger strings. */
+
+#ifdef __LITTLE_ENDIAN__
+ ldx r12,0,r8
+ ldx r11,r8,r6
+#else
+ ldbrx r12,0,r8
+ ldbrx r11,r8,r6
+#endif
+ cmpb r3,r12,r4
+ cmpb r9,r11,r4
+ or r5,r9,r3 /* Merge everything in one doubleword. */
+ cmpldi cr7,r5,0
+ bne cr7,L(found)
+ addi r8,r8,-16
+ bdnz L(loop)
+
+	/* We may have one more doubleword to read.  */
+ cmpld r8,r0
+ bnelr
+
+#ifdef __LITTLE_ENDIAN__
+ ldx r12,0,r8
+#else
+ ldbrx r12,0,r8
+#endif
+ cmpb r3,r12,r4
+ cmpldi cr7,r3,0
+ bne cr7,L(done)
+ blr
+
+ .align 4
+L(found):
+ /* OK, one (or both) of the dwords contains BYTE. Check
+ the first dword. */
+ cmpldi cr6,r3,0
+ bne cr6,L(done)
+
+ /* BYTE must be in the second word. Adjust the address
+ again and move the result of cmpb to r3 so we can calculate the
+ pointer. */
+
+ mr r3,r9
+ addi r8,r8,-8
+
+ /* r3 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as BYTE in the original
+ word from the string. Use that to calculate the pointer.
+ We need to make sure BYTE is *before* the end of the
+ range. */
+L(done):
+ cntlzd r9,r3 /* Count leading zeros before the match. */
+ cmpld r8,r0 /* Are we on the last word? */
+ srdi r6,r9,3 /* Convert leading zeros to bytes. */
+ addi r0,r6,-7
+ sub r3,r8,r0
+ cmpld cr7,r3,r10
+ bnelr
+ bgelr cr7
+ li r3,0
+ blr
+
+ .align 4
+L(null):
+ li r3,0
+ blr
+
+/* Deals with size <= 32. */
+ .align 4
+L(small_range):
+ cmpldi r5,0
+ beq L(null)
+
+#ifdef __LITTLE_ENDIAN__
+ ldx r12,0,r8
+#else
+ ldbrx r12,0,r8 /* Load reversed doubleword from memory. */
+#endif
+ cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
+ and r3,r3,r9
+ cmpldi cr7,r3,0
+ bne cr7,L(done)
+
+ /* Are we done already? */
+ cmpld r8,r0
+ addi r8,r8,-8
+ beqlr
+
+ .align 5
+L(loop_small):
+#ifdef __LITTLE_ENDIAN__
+ ldx r12,0,r8
+#else
+ ldbrx r12,0,r8
+#endif
+ cmpb r3,r12,r4
+ cmpld r8,r0
+ cmpldi cr7,r3,0
+ bne cr7,L(done)
+ addi r8,r8,-8
+ bne L(loop_small)
+ blr
+
+END (MEMRCHR)
+weak_alias (__memrchr, memrchr)
+libc_hidden_builtin_def (memrchr)
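
For readers without the ISA manual at hand: cmpb writes 0xff into every byte
lane where its two operands hold the same value, which is exactly the mask
the code above tests and feeds into the leading-zero count. A hedged C model
of the instruction:

    #include <stdint.h>

    static uint64_t emulate_cmpb (uint64_t a, uint64_t b)
    {
      uint64_t r = 0;
      for (int i = 0; i < 8; i++)
        if (((a >> (8 * i)) & 0xff) == ((b >> (8 * i)) & 0xff))
          r |= (uint64_t) 0xff << (8 * i);
      return r;   /* 0xff per matching byte lane, 0x00 elsewhere */
    }
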
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S
new file mode 100644
index 0000000000..21933c0672
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -0,0 +1,399 @@
+/* Optimized memset implementation for PowerPC64/POWER7.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'. */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+ .machine power7
+EALIGN (MEMSET, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memset):
+ cmpldi cr7,5,31
+ cmpldi cr6,5,8
+ mr 10,3
+
+ /* Replicate byte to word. */
+ insrdi 4,4,8,48
+ insrdi 4,4,16,32
+ ble cr6,L(small) /* If length <= 8, use short copy code. */
+
+ neg 0,3
+ ble cr7,L(medium) /* If length < 32, use medium copy code. */
+
+	andi.	11,10,7	/* Check alignment of DST.  */
+ insrdi 4,4,32,0 /* Replicate word to double word. */
+
+ mr 12,5
+ beq L(big_aligned)
+
+ clrldi 0,0,61
+ mtocrf 0x01,0
+ subf 5,0,5
+
+ /* Get DST aligned to 8 bytes. */
+1: bf 31,2f
+
+ stb 4,0(10)
+ addi 10,10,1
+2: bf 30,4f
+
+ sth 4,0(10)
+ addi 10,10,2
+4: bf 29,L(big_aligned)
+
+ stw 4,0(10)
+ addi 10,10,4
+
+ .align 4
+L(big_aligned):
+
+ cmpldi cr5,5,255
+ li 0,32
+ dcbtst 0,10
+ cmpldi cr6,4,0
+ srdi 9,5,3 /* Number of full doublewords remaining. */
+ crand 27,26,21
+ mtocrf 0x01,9
+ bt 27,L(huge)
+
+ /* From this point on, we'll copy 32+ bytes and the value
+ isn't 0 (so we can't use dcbz). */
+
+ srdi 8,5,5
+ clrldi 11,5,61
+ cmpldi cr6,11,0
+ cmpldi cr1,9,4
+ mtctr 8
+
+ /* Copy 1~3 doublewords so the main loop starts
+ at a multiple of 32 bytes. */
+
+ bf 30,1f
+
+ std 4,0(10)
+ std 4,8(10)
+ addi 10,10,16
+ bf 31,L(big_loop)
+
+ std 4,0(10)
+ addi 10,10,8
+ mr 12,10
+ blt cr1,L(tail_bytes)
+ b L(big_loop)
+
+ .align 4
+1: /* Copy 1 doubleword. */
+ bf 31,L(big_loop)
+
+ std 4,0(10)
+ addi 10,10,8
+
+ /* Main aligned copy loop. Copies 32-bytes at a time and
+ ping-pong through r10 and r12 to avoid AGEN delays. */
+ .align 4
+L(big_loop):
+ addi 12,10,32
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ bdz L(tail_bytes)
+
+ addi 10,10,64
+ std 4,0(12)
+ std 4,8(12)
+ std 4,16(12)
+ std 4,24(12)
+ bdnz L(big_loop)
+
+ mr 12,10
+ b L(tail_bytes)
+
+ .align 4
+L(tail_bytes):
+
+ /* Check for tail bytes. */
+ beqlr cr6
+
+ clrldi 0,5,61
+ mtocrf 0x01,0
+
+ /* At this point we have a tail of 0-7 bytes and we know that the
+ destination is doubleword-aligned. */
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ stw 4,0(12)
+ addi 12,12,4
+2: /* Copy 2 bytes. */
+ bf 30,1f
+
+ sth 4,0(12)
+ addi 12,12,2
+1: /* Copy 1 byte. */
+ bflr 31
+
+ stb 4,0(12)
+ blr
+
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out 128-bytes at a time. Before using
+ dcbz though, we need to get the destination 128-bytes aligned. */
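
The neg/clrldi pair computes how far the pointer is from the next 128-byte
boundary; the same address arithmetic in C (a sketch of the math only, not
of the store sequence):

    #include <stddef.h>
    #include <stdint.h>

    static size_t bytes_to_128_boundary (const void *p)
    {
      /* -(uintptr_t) p, masked to its low 7 bits, is the distance up to
         the next multiple of 128 (0 when already aligned).  */
      return (size_t) (-(uintptr_t) p & 127);
    }
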
+ .align 4
+L(huge):
+ andi. 11,10,127
+ neg 0,10
+ beq L(huge_aligned)
+
+ clrldi 0,0,57
+ subf 5,0,5
+ srdi 0,0,3
+ mtocrf 0x01,0
+
+ /* Get DST aligned to 128 bytes. */
+8: bf 28,4f
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ std 4,32(10)
+ std 4,40(10)
+ std 4,48(10)
+ std 4,56(10)
+ addi 10,10,64
+ .align 4
+4: bf 29,2f
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ addi 10,10,32
+ .align 4
+2: bf 30,1f
+
+ std 4,0(10)
+ std 4,8(10)
+ addi 10,10,16
+ .align 4
+1: bf 31,L(huge_aligned)
+
+ std 4,0(10)
+ addi 10,10,8
+
+
+L(huge_aligned):
+ srdi 8,5,7
+ clrldi 11,5,57
+ cmpldi cr6,11,0
+ mtctr 8
+
+ .align 4
+L(huge_loop):
+ dcbz 0,10
+ addi 10,10,128
+ bdnz L(huge_loop)
+
+ /* Check how many bytes are still left. */
+ beqlr cr6
+
+ subf 9,3,10
+ subf 5,9,12
+ srdi 8,5,3
+ cmpldi cr6,8,0
+ mtocrf 0x01,8
+
+   /* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
+      speed.  We'll handle the resulting tail bytes later.  */
+ beq cr6,L(tail)
+
+8: bf 28,4f
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ std 4,32(10)
+ std 4,40(10)
+ std 4,48(10)
+ std 4,56(10)
+ addi 10,10,64
+ .align 4
+4: bf 29,2f
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ addi 10,10,32
+ .align 4
+2: bf 30,1f
+
+ std 4,0(10)
+ std 4,8(10)
+ addi 10,10,16
+ .align 4
+1: bf 31,L(tail)
+
+ std 4,0(10)
+ addi 10,10,8
+
+ /* Handle the rest of the tail bytes here. */
+L(tail):
+ mtocrf 0x01,5
+
+ .align 4
+4: bf 29,2f
+
+ stw 4,0(10)
+ addi 10,10,4
+ .align 4
+2: bf 30,1f
+
+ sth 4,0(10)
+ addi 10,10,2
+ .align 4
+1: bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Expanded tree to copy tail bytes without increments. */
+ .align 4
+L(copy_tail):
+ bf 29,L(FXX)
+
+ stw 4,0(10)
+ bf 30,L(TFX)
+
+ sth 4,4(10)
+ bflr 31
+
+ stb 4,6(10)
+ blr
+
+ .align 4
+L(FXX): bf 30,L(FFX)
+
+ sth 4,0(10)
+ bflr 31
+
+ stb 4,2(10)
+ blr
+
+ .align 4
+L(TFX): bflr 31
+
+ stb 4,4(10)
+ blr
+
+ .align 4
+L(FFX): bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Handle copies of 9~31 bytes. */
+ .align 4
+L(medium):
+ /* At least 9 bytes to go. */
+ andi. 11,10,3
+ clrldi 0,0,62
+ beq L(medium_aligned)
+
+ /* Force 4-bytes alignment for DST. */
+ mtocrf 0x01,0
+ subf 5,0,5
+1: /* Copy 1 byte. */
+ bf 31,2f
+
+ stb 4,0(10)
+ addi 10,10,1
+2: /* Copy 2 bytes. */
+ bf 30,L(medium_aligned)
+
+ sth 4,0(10)
+ addi 10,10,2
+
+ .align 4
+L(medium_aligned):
+ /* At least 6 bytes to go, and DST is word-aligned. */
+ cmpldi cr1,5,16
+ mtocrf 0x01,5
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ stw 4,0(10)
+ stw 4,4(10)
+ stw 4,8(10)
+ stw 4,12(10)
+ addi 10,10,16
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ stw 4,0(10)
+ stw 4,4(10)
+ addi 10,10,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ stw 4,0(10)
+ addi 10,10,4
+2: /* Copy 2-3 bytes. */
+ bf 30,1f
+
+ sth 4,0(10)
+ addi 10,10,2
+1: /* Copy 1 byte. */
+ bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(small):
+ mtocrf 0x01,5
+ bne cr6,L(copy_tail)
+
+ stw 4,0(10)
+ stw 4,4(10)
+ blr
+
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY (__bzero)
+ CALL_MCOUNT 3
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies
new file mode 100644
index 0000000000..bf5d6171a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
new file mode 100644
index 0000000000..48afb75943
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
@@ -0,0 +1,115 @@
+/* Optimized rawmemchr implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void * [r3] rawmemchr (const void *s [r3], int c [r4])  */
+
+#ifndef RAWMEMCHR
+# define RAWMEMCHR __rawmemchr
+#endif
+ .machine power7
+ENTRY (RAWMEMCHR)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* Now r4 has a doubleword of c bytes. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ ld r12,0(r8) /* Load doubleword from memory. */
+ cmpb r5,r12,r4 /* Compare each byte against c byte. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6 /* Move left to discard ignored bits. */
+ srd r5,r5,r6 /* Bring the bits back as zeros. */
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+ /* Handle DWORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r4
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the byte-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r4
+ cmpb r6,r11,r4
+ or r7,r5,r6
+ cmpdi cr7,r7,0
+ beq cr7,L(loop)
+
+ /* OK, one (or both) of the doublewords contains a 'c' byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The 'c' byte must be in the second doubleword. Adjust the address
+      again and move the result of cmpb to r5 so we can calculate the
+ pointer. */
+ mr r5,r6
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the 'c' byte in the original
+ doubleword from the string. Use that fact to find out what is
+ the position of the byte inside the string. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0 /* Count trailing zeros. */
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching char. */
+ blr
+END (RAWMEMCHR)
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
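
On little-endian the match offset falls out of the addi/andc/popcntd triple
above rather than a leading-zero count; a hedged C analogue, with the GCC
builtin standing in for popcntd:

    #include <stdint.h>

    static int first_match_index_le (uint64_t m)   /* m = cmpb result */
    {
      /* (m - 1) & ~m sets exactly the bits below the lowest set bit,
         so its popcount is 8 * (byte index of the first match).  */
      uint64_t below = (m - 1) & ~m;
      return __builtin_popcountll (below) / 8;
    }
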
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S
new file mode 100644
index 0000000000..a346dd7e28
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER7.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
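
This file builds the POWER7 strncpy with USE_AS_STPNCPY set; the semantic
difference is only the return value: stpncpy copies at most n bytes,
zero-fills any remainder, and returns a pointer to the terminating null it
wrote (or dest + n when no null was written). A quick usage sketch:

    #include <string.h>

    static void demo_stpncpy (void)
    {
      char buf[8];
      char *p = stpncpy (buf, "hi", sizeof buf);
      /* p == buf + 2, and buf[2] through buf[7] are all '\0'.  */
      (void) p;
    }
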
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
new file mode 100644
index 0000000000..e856b8a593
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -0,0 +1,126 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+ or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+ int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+ __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP strcasecmp
+#endif
+
+ENTRY (__STRCMP)
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+ CALL_MCOUNT 2
+#else
+ CALL_MCOUNT 3
+#endif
+
+#define rRTN r3 /* Return value */
+#define rSTR1 r5 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rLOCARG r5 /* 3rd argument: locale_t */
+#define rCHAR1 r6 /* Byte read from 1st string */
+#define rCHAR2 r7 /* Byte read from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Word tolower(rCHAR1) */
+#define rLWR2 r12 /* Word tolower(rCHAR2) */
+#define rTMP r9
+#define rLOC r11 /* Default locale address */
+
+ cmpd cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+ ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+ add rLOC, rTMP, __libc_tsd_LOCALE@tls
+ ld rLOC, 0(rLOC)
+#else
+ mr rLOC, rLOCARG
+#endif
+ ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ mr rSTR1, rRTN
+ li rRTN, 0
+ beqlr cr7
+
+
+	/* Unrolled loop for POWER: loads are done with 'lbz' plus an
+	   offset, and the string pointers are only updated at the end
+	   of each unrolled pass.  */
+
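
A C shape of the comparison step, un-unrolled (tolower_tab stands in for the
locale's LOCALE_CTYPE_TOLOWER word table that the sldi-by-2 in the loop
below indexes):

    #include <stdint.h>

    static int casecmp_sketch (const unsigned char *s1,
                               const unsigned char *s2,
                               const int32_t *tolower_tab)
    {
      for (;; s1++, s2++)
        {
          int c1 = tolower_tab[*s1];
          int c2 = tolower_tab[*s2];
          if (c1 != c2 || *s1 == '\0')
            return c1 - c2;        /* mismatch, or both strings ended */
        }
    }
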
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+L(loop):
+ cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
+	crorc	4*cr1+eq,eq,4*cr1+eq /* Done if *s1 == '\0' or they differ */
+ beq cr1, L(done)
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ cmpdi rCHAR1, 0
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1,L(done)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+ b L(loop)
+L(done):
+ subf r0, rLWR2, rLWR1
+ extsw rRTN, r0
+ blr
+END (__STRCMP)
+
+weak_alias (__STRCMP, STRCMP)
+libc_hidden_builtin_def (__STRCMP)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S
new file mode 100644
index 0000000000..a18e2e101c
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S
@@ -0,0 +1,230 @@
+/* Optimized strchr implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef STRCHR
+# define STRCHR strchr
+#endif
+
+/* char * [r3] strchr (const char *s [r3], int c [r4]) */
+ .machine power7
+ENTRY (STRCHR)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r9,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r9,r4
+ cmpb r7,r9,r0
+ or r12,r10,r11
+ or r9,r6,r7
+ or r5,r12,r9
+ cmpdi cr7,r5,0
+ beq cr7,L(loop)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r10,-1
+ andc r3,r3,r10
+ popcntd r0,r3
+ addi r4,r11,-1
+ andc r4,r4,r11
+ cmpld cr7,r3,r4
+ bgt cr7,L(no_match)
+#else
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+ srdi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ blr
+
+ .align 4
+L(no_match):
+ li r3,0
+ blr
+
+/* We are here because strchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0     /* If r5 == 0, no null bytes
+			        have been found.  */
+ bne cr7,L(done_null)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop_null)
+
+	/* Handle DWORD2 of pair.  */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(loop_null)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+END (STRCHR)
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
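
In C terms, each loop iteration above folds two scans into one: a doubleword
is interesting if it contains either the target byte or a NUL. A sketch
reusing the emulate_cmpb and repeat_byte helpers from the earlier notes
(assumed in scope):

    #include <stdint.h>

    static int has_c_or_nul (uint64_t w, unsigned char c)
    {
      uint64_t mc = emulate_cmpb (w, repeat_byte (c)); /* 0xff where byte == c */
      uint64_t mn = emulate_cmpb (w, 0);               /* 0xff where byte == 0 */
      return (mc | mn) != 0;      /* one OR replaces two separate branches */
    }
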
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S
new file mode 100644
index 0000000000..27bc1f0682
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S
@@ -0,0 +1,131 @@
+/* Optimized strchrnul implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef STRCHRNUL
+# define STRCHRNUL __strchrnul
+#endif
+/* char * [r3] strchrnul (const char *s [r3], int c [r4]) */
+ .machine power7
+ENTRY (STRCHRNUL)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+	cmpb	r10,r12,r0	/* Compare each byte against null byte.  */
+	cmpb	r9,r12,r4	/* Compare each byte against c byte.  */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and to bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r9,r9,r6
+ sld r10,r10,r6
+ sld r9,r9,r6
+#else
+ sld r10,r10,r6
+ sld r9,r9,r6
+ srd r10,r10,r6
+ srd r9,r9,r6
+#endif
+ or r5,r9,r10 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+ /* Handle DWORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r0
+ cmpb r9,r12,r4
+ or r5,r9,r10
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r10,r12,r0
+ cmpb r9,r12,r4
+ cmpb r6,r11,r0
+ cmpb r7,r11,r4
+ or r5,r9,r10
+ or r10,r6,r7
+ or r11,r5,r10
+ cmpdi cr7,r11,0
+ beq cr7,L(loop)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r5 so we can calculate
+ the pointer. */
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r8,r0 /* Return address of matching c/null byte. */
+ blr
+END (STRCHRNUL)
+weak_alias (STRCHRNUL, strchrnul)
+libc_hidden_builtin_def (STRCHRNUL)
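
STRCHRNUL above is essentially a word-at-a-time scan in which the
c-match and null-match masks are OR-merged (or r5,r9,r10) so a single
branch tests for either terminator.  A hedged C sketch of the
steady-state loop, reusing cmpb_model and first_match_offset from the
strchr notes above; the assembly additionally handles the unaligned
head, and its full-doubleword reads are only safe because aligned loads
cannot cross a page boundary, which plain C cannot promise.

    #include <stdint.h>
    #include <string.h>

    /* Assumes a little-endian host (pass 0 to first_match_offset on
       big-endian).  Sketch only: the 8-byte memcpy stands in for the
       aligned doubleword load.  */
    static char *
    strchrnul_sketch (const char *s, int c)
    {
      /* Replicate c into all eight byte lanes (the insrdi sequence).  */
      uint64_t rep = 0x0101010101010101ULL * (unsigned char) c;
      for (;;)
        {
          uint64_t w;
          memcpy (&w, s, 8);                 /* one doubleword load */
          uint64_t mc = cmpb_model (w, rep); /* bytes equal to c */
          uint64_t mz = cmpb_model (w, 0);   /* null bytes */
          if ((mc | mz) != 0)                /* or r5,r9,r10 above */
            return (char *) s + first_match_offset (mc | mz, 1);
          s += 8;
        }
    }
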
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S
new file mode 100644
index 0000000000..14e14f457e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -0,0 +1,168 @@
+/* Optimized strcmp implementation for Power7 using 'cmpb' instruction
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The optimization here relies on the cmpb instruction: 8-byte-aligned
+   strings are processed with doubleword comparisons, and unaligned
+   strings are handled efficiently with a loop-unrolling technique.  */
+
+#include <sysdep.h>
+
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
+/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */
+
+ .machine power7
+EALIGN (STRCMP, 4, 0)
+ CALL_MCOUNT 2
+
+ or r9, r3, r4
+	rldicl. r10, r9, 0, 61	/* Are s1 and s2 doubleword aligned?  */
+ bne cr0, L(process_unaligned_bytes)
+ li r5, 0
+
+ .align 4
+/* Process the input strings on a doubleword-aligned boundary.  */
+L(unrollDword):
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,16(r3)
+ ld r10,16(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,24(r3)
+ ld r10,24(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ addi r3, r3, 32
+ addi r4, r4, 32
+ beq cr7, L(unrollDword)
+
+ .align 4
+L(null_found):
+#ifdef __LITTLE_ENDIAN__
+ neg r7,r9
+ and r9,r9,r7
+ li r7,-1
+ cntlzd r9,r9
+ subfic r9,r9,71
+ sld r9,r7,r9
+#else
+ cntlzd r9,r9
+ li r7,-1
+ addi r9,r9,8
+ srd r9,r7,r9
+#endif
+ or r8,r8,r9
+ or r10,r10,r9
+
+L(different):
+ cmpb r9,r8,r10
+#ifdef __LITTLE_ENDIAN__
+ addi r7,r9,1
+ andc r9,r7,r9
+ cntlzd r9,r9
+ subfic r9,r9,63
+#else
+ not r9,r9
+ cntlzd r9,r9
+ subfic r9,r9,56
+#endif
+ srd r3,r8,r9
+ srd r10,r10,r9
+ rldicl r10,r10,0,56
+ rldicl r3,r3,0,56
+ subf r3,r10,r3
+ blr
+
+ .align 4
+L(process_unaligned_bytes):
+ lbz r9, 0(r3) /* load byte from s1 */
+ lbz r10, 0(r4) /* load byte from s2 */
+ cmpdi cr7, r9, 0 /* compare *s1 with NULL */
+	beq cr7, L(diffOfNULL)	/* if *s1 is null, return *s1 - *s2 */
+ cmplw cr7, r9, r10 /* compare *s1 and *s2 */
+ bne cr7, L(ComputeDiff) /* branch to compute difference and return */
+
+ lbz r9, 1(r3) /* load next byte from s1 */
+ lbz r10, 1(r4) /* load next byte from s2 */
+ cmpdi cr7, r9, 0 /* compare *s1 with NULL */
+	beq cr7, L(diffOfNULL)	/* if *s1 is null, return *s1 - *s2 */
+ cmplw cr7, r9, r10 /* compare *s1 and *s2 */
+ bne cr7, L(ComputeDiff) /* branch to compute difference and return */
+
+ lbz r9, 2(r3) /* unroll 3rd byte here */
+ lbz r10, 2(r4)
+ cmpdi cr7, r9, 0
+ beq cr7, L(diffOfNULL)
+ cmplw cr7, r9, r10
+	bne cr7, L(ComputeDiff)
+
+ lbz r9, 3(r3) /* unroll 4th byte now */
+ lbz r10, 3(r4)
+ addi r3, r3, 4 /* increment s1 by unroll factor */
+ cmpdi cr7, r9, 0
+	cmplw cr6, r9, r10
+ beq cr7, L(diffOfNULL)
+ addi r4, r4, 4 /* increment s2 by unroll factor */
+ beq cr6, L(process_unaligned_bytes) /* unroll byte processing */
+
+ .align 4
+L(ComputeDiff):
+ extsw r9, r9
+ subf r10, r10, r9 /* compute s1 - s2 */
+ extsw r3, r10
+ blr /* return */
+
+ .align 4
+L(diffOfNULL):
+ li r9, 0
+ subf r10, r10, r9 /* compute s1 - s2 */
+ extsw r3, r10 /* sign extend result */
+ blr /* return */
+
+END (STRCMP)
+libc_hidden_builtin_def (strcmp)
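
The subtle step in STRCMP above is L(null_found): once the s1
doubleword is known to contain the terminator, both doublewords are
OR-ed with a mask of ones covering every byte after the first null, so
garbage past the terminator can no longer make L(different) report a
mismatch.  A little-endian C model of that masking, with a hypothetical
helper name:

    #include <stdint.h>

    /* zmask is the cmpb result of s1's doubleword against 0, i.e. 0xff
       at the null byte.  Force all bytes strictly after the first null
       to 0xff in both words; the null byte itself still participates
       in the comparison.  */
    static void
    mask_past_null_le (uint64_t *w1, uint64_t *w2, uint64_t zmask)
    {
      uint64_t low = zmask & -zmask;   /* lowest bit of the first null */
      uint64_t keep = (low << 8) - 1;  /* bits up to and incl. that byte */
      *w1 |= ~keep;
      *w2 |= ~keep;
    }

When the null sits in the last byte, low << 8 overflows to 0 and keep
becomes all ones, so no masking happens; that matches the shift-by-64
behaviour of the sld in the assembly.
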
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S
new file mode 100644
index 0000000000..63848c460c
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S
@@ -0,0 +1,107 @@
+/* Optimized strlen implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* size_t [r3] strlen (const char *s [r3])  */
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+ .machine power7
+ENTRY (STRLEN)
+ CALL_MCOUNT 1
+ dcbt 0,r3
+ clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r4) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r5,r5,r6
+#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r4
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+ /* Handle DWORD2 of pair. */
+ ldu r12,8(r4)
+ cmpb r10,r12,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+
+ ld r12, 8(r4)
+ ldu r11, 16(r4)
+ cmpb r10,r12,r0
+ cmpb r9,r11,r0
+ or r8,r9,r10 /* Merge everything in one doubleword. */
+ cmpdi cr7,r8,0
+ beq cr7,L(loop)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r10,0
+ addi r4,r4,-8
+ bne cr6,L(done)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r9
+ addi r4,r4,8
+
+ /* r10 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the length. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10, -1 /* Form a mask from trailing zeros. */
+ andc r9, r9, r10
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r5,r0 /* Compute final length. */
+ blr
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
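
The startup trick in STRLEN is the orc: after rounding the pointer down
to a doubleword, the bytes before the true start of the string are
forced non-zero by OR-ing with the complement of a shifted all-ones
mask, so the first cmpb cannot report a false null.  A hedged sketch
for a big-endian host, reusing cmpb_model from the strchr notes; as
before, the doubleword over-read is only a model of what the aligned
assembly load does.

    #include <stdint.h>
    #include <string.h>

    static size_t
    strlen_sketch_be (const char *s)
    {
      const char *base = (const char *) ((uintptr_t) s & ~(uintptr_t) 7);
      unsigned pad = (unsigned) (s - base);   /* bytes before the string */
      uint64_t w, mask = ~0ULL >> (8 * pad);  /* MASK = MASK >> padding */
      memcpy (&w, base, 8);
      uint64_t z = cmpb_model (w | ~mask, 0); /* orc masks the pad bytes */
      while (z == 0)
        {
          base += 8;
          memcpy (&w, base, 8);
          z = cmpb_model (w, 0);
        }
      return (base - s) + __builtin_clzll (z) / 8;
    }
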
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S
new file mode 100644
index 0000000000..d53b31be8e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S
@@ -0,0 +1,227 @@
+/* Optimized strncmp implementation for POWER7/PowerPC64.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef STRNCMP
+# define STRNCMP strncmp
+#endif
+
+/* See strlen.S for comments on how the end-of-string testing works.  */
+
+/* int [r3] strncmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+
+EALIGN (STRNCMP,5,0)
+ CALL_MCOUNT 3
+
+#define rTMP2 r0
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r10
+#define rWORD4 r11
+#define rFEFE r8 /* constant 0xfefefefefefefeff (-0x0101010101010101) */
+#define r7F7F r9 /* constant 0x7f7f7f7f7f7f7f7f */
+#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
+#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+#define rTMP r12
+
+ dcbt 0,rSTR1
+ nop
+ or rTMP,rSTR2,rSTR1
+ lis r7F7F,0x7f7f
+ dcbt 0,rSTR2
+ nop
+ clrldi. rTMP,rTMP,61
+ cmpldi cr1,rN,0
+ lis rFEFE,-0x101
+ bne L(unaligned)
+/* We are doubleword aligned, so set up for two loops: first a doubleword
+   loop, then fall into the byte loop if there is any residual.  */
+ srdi. rTMP,rN,3
+ clrldi rN,rN,61
+ addi rFEFE,rFEFE,-0x101
+ addi r7F7F,r7F7F,0x7f7f
+ cmpldi cr1,rN,0
+ beq L(unaligned)
+
+ mtctr rTMP
+ ld rWORD1,0(rSTR1)
+ ld rWORD2,0(rSTR2)
+ sldi rTMP,rFEFE,32
+ insrdi r7F7F,r7F7F,32,0
+ add rFEFE,rFEFE,rTMP
+ b L(g1)
+
+L(g0):
+ ldu rWORD1,8(rSTR1)
+ bne cr1,L(different)
+ ldu rWORD2,8(rSTR2)
+L(g1): add rTMP,rFEFE,rWORD1
+ nor rNEG,r7F7F,rWORD1
+ bdz L(tail)
+ and. rTMP,rTMP,rNEG
+ cmpd cr1,rWORD1,rWORD2
+ beq L(g0)
+
+/* OK. We've hit the end of the string. We need to be careful that
+ we don't compare two strings as different because of gunk beyond
+ the end of the strings... */
+
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ addi rTMP2, rTMP, -1
+ beq cr1, L(equal)
+ andc rTMP2, rTMP2, rTMP
+ rldimi rTMP2, rTMP2, 1, 0
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ cmpd cr1, rWORD1, rWORD2
+ beq cr1, L(equal)
+ cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */
+ addi rNEG, rBITDIF, 1
+ orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */
+ sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */
+ andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */
+ andc rWORD2, rWORD2, rNEG
+ xor. rBITDIF, rWORD1, rWORD2
+ sub rRTN, rWORD1, rWORD2
+ blt L(highbit)
+ sradi rRTN, rRTN, 63 /* must return an int. */
+ ori rRTN, rRTN, 1
+ blr
+L(equal):
+ li rRTN, 0
+ blr
+
+L(different):
+ ld rWORD1, -8(rSTR1)
+ cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */
+ addi rNEG, rBITDIF, 1
+ orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */
+ sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */
+ andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */
+ andc rWORD2, rWORD2, rNEG
+ xor. rBITDIF, rWORD1, rWORD2
+ sub rRTN, rWORD1, rWORD2
+ blt L(highbit)
+ sradi rRTN, rRTN, 63
+ ori rRTN, rRTN, 1
+ blr
+L(highbit):
+ sradi rRTN, rWORD2, 63
+ ori rRTN, rRTN, 1
+ blr
+
+#else
+L(endstring):
+ and rTMP,r7F7F,rWORD1
+ beq cr1,L(equal)
+ add rTMP,rTMP,r7F7F
+ xor. rBITDIF,rWORD1,rWORD2
+ andc rNEG,rNEG,rTMP
+ blt L(highbit)
+ cntlzd rBITDIF,rBITDIF
+ cntlzd rNEG,rNEG
+ addi rNEG,rNEG,7
+ cmpd cr1,rNEG,rBITDIF
+ sub rRTN,rWORD1,rWORD2
+ blt cr1,L(equal)
+ sradi rRTN,rRTN,63 /* must return an int. */
+ ori rRTN,rRTN,1
+ blr
+L(equal):
+ li rRTN,0
+ blr
+
+L(different):
+ ld rWORD1,-8(rSTR1)
+ xor. rBITDIF,rWORD1,rWORD2
+ sub rRTN,rWORD1,rWORD2
+ blt L(highbit)
+ sradi rRTN,rRTN,63
+ ori rRTN,rRTN,1
+ blr
+L(highbit):
+ sradi rRTN,rWORD2,63
+ ori rRTN,rRTN,1
+ blr
+#endif
+
+/* Oh well. In this case, we just do a byte-by-byte comparison. */
+ .align 4
+L(tail):
+ and. rTMP,rTMP,rNEG
+ cmpd cr1,rWORD1,rWORD2
+ bne L(endstring)
+ addi rSTR1,rSTR1,8
+ bne cr1,L(different)
+ addi rSTR2,rSTR2,8
+ cmpldi cr1,rN,0
+L(unaligned):
+ mtctr rN
+ ble cr1,L(ux)
+L(uz):
+ lbz rWORD1,0(rSTR1)
+ lbz rWORD2,0(rSTR2)
+ .align 4
+L(u1):
+ cmpdi cr1,rWORD1,0
+ bdz L(u4)
+ cmpd rWORD1,rWORD2
+ beq cr1,L(u4)
+ bne L(u4)
+ lbzu rWORD3,1(rSTR1)
+ lbzu rWORD4,1(rSTR2)
+ cmpdi cr1,rWORD3,0
+ bdz L(u3)
+ cmpd rWORD3,rWORD4
+ beq cr1,L(u3)
+ bne L(u3)
+ lbzu rWORD1,1(rSTR1)
+ lbzu rWORD2,1(rSTR2)
+ cmpdi cr1,rWORD1,0
+ bdz L(u4)
+ cmpd rWORD1,rWORD2
+ beq cr1,L(u4)
+ bne L(u4)
+ lbzu rWORD3,1(rSTR1)
+ lbzu rWORD4,1(rSTR2)
+ cmpdi cr1,rWORD3,0
+ bdz L(u3)
+ cmpd rWORD3,rWORD4
+ beq cr1,L(u3)
+ bne L(u3)
+ lbzu rWORD1,1(rSTR1)
+ lbzu rWORD2,1(rSTR2)
+ b L(u1)
+
+L(u3): sub rRTN,rWORD3,rWORD4
+ blr
+L(u4): sub rRTN,rWORD1,rWORD2
+ blr
+L(ux):
+ li rRTN,0
+ blr
+END (STRNCMP)
+libc_hidden_builtin_def (strncmp)
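
Unlike the cmpb-based files, STRNCMP above detects the terminator with
the classic carry trick encoded in rFEFE and r7F7F: a doubleword x
contains a zero byte exactly when (x + 0xfefefefefefefeff) &
~(x | 0x7f7f7f7f7f7f7f7f) is nonzero; adding 0xfefe...ff is the same as
subtracting 0x0101...01, and ~(x | 0x7f7f...7f) is ~x & 0x8080...80.
A self-contained C check of the identity:

    #include <assert.h>
    #include <stdint.h>

    static int
    has_zero_byte (uint64_t x)
    {
      return ((x + 0xfefefefefefefeffULL)            /* rFEFE */
              & ~(x | 0x7f7f7f7f7f7f7f7fULL)) != 0;  /* r7F7F */
    }

    int
    main (void)
    {
      assert (has_zero_byte (0x1122334455667700ULL));  /* zero low byte */
      assert (!has_zero_byte (0x1122334455667788ULL)); /* no zero byte */
      return 0;
    }
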
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000000..0224f74898
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,722 @@
+/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+ char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ AND
+
+ char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+   The algorithm is as follows:
+   > if src and dst are 8-byte aligned, perform doubleword copies;
+     else
+   > copy byte by byte on unaligned addresses.
+
+   The aligned comparisons are made using cmpb instructions.  */
+
+/* The optimizations for performance are as follows:
+   1. data alignment [gain from aligned memory access on read/write];
+   2. loop unrolling/unwinding, from which POWER7 gains
+      [gain by reduction of branch penalty];
+   3. the final pad with null bytes is done by calling an optimized
+      memset.  */
+
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
+#else
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+#endif /* !USE_AS_STPNCPY */
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+
+#ifndef MEMSET
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define MEMSET __GI_memset
+# else
+# define MEMSET memset
+# endif
+#endif
+
+ .machine power7
+EALIGN(FUNC_NAME, 4, 0)
+ CALL_MCOUNT 3
+
+	mflr r0			/* load link register LR to r0 */
+	or r10, r3, r4		/* merge src and dst to check their alignment */
+	rldicl. r8, r10, 0, 61	/* are both doubleword aligned? */
+
+	std r19, -8(r1)		/* save caller's register r19 */
+	std r18, -16(r1)	/* save caller's register r18 */
+ std r0, 16(r1) /* store the link register */
+ stdu r1, -FRAMESIZE(r1) /* create the stack frame */
+
+ mr r9, r3 /* save r3 into r9 for use */
+ mr r18, r3 /* save r3 for retCode of strncpy */
+	bne cr0, L(unaligned)
+
+L(aligned):
+	srdi r11, r5, 3		/* compute count for CTR; count = n/8 */
+	cmpldi cr7, r11, 3	/* if count >= 4, perform 4-way unrolling */
+	ble cr7, L(update1)
+
+ ld r10, 0(r4) /* load doubleWord from src */
+	cmpb r8, r10, r8	/* compare the src dword just read against null
+				   (r8 is 0 on this path) */
+	cmpdi cr7, r8, 0	/* did cmpb find a null byte? */
+	bne cr7, L(update3)	/* yes; handle the trailing doubleword */
+
+ std r10, 0(r3) /* copy doubleword at offset=0 */
+ ld r10, 8(r4) /* load next doubleword from offset=8 */
+	cmpb r8, r10, r8	/* compare the src dword just read against null */
+	cmpdi cr7, r8, 0	/* did cmpb find a null byte? */
+	bne cr7, L(HopBy8)	/* yes; handle the trailing doubleword */
+
+ addi r8, r11, -4
+ mr r7, r3
+ srdi r8, r8, 2
+ mr r6, r4
+ addi r8, r8, 1
+ li r12, 0
+ mtctr r8
+ b L(dwordCopy)
+
+ .p2align 4
+L(dWordUnroll):
+ std r8, 16(r9)
+ ld r8, 24(r4) /* load dword,perform loop unrolling again */
+ cmpb r10, r8, r10
+ cmpdi cr7, r10, 0
+ bne cr7, L(HopBy24)
+
+ std r8, 24(r7) /* copy dword at offset=24 */
+ addi r9, r9, 32
+ addi r4, r4, 32
+ bdz L(leftDwords) /* continue with loop on counter */
+
+ ld r3, 32(r6)
+ cmpb r8, r3, r10
+ cmpdi cr7, r8, 0
+ bne cr7, L(update2)
+
+ std r3, 32(r7)
+ ld r10, 40(r6)
+ cmpb r8, r10, r8
+ cmpdi cr7, r8, 0
+ bne cr7, L(HopBy40)
+
+ mr r6, r4 /* update values */
+ mr r7, r9
+ mr r11, r0
+ mr r5, r19
+
+L(dwordCopy):
+ std r10, 8(r9) /* copy dword at offset=8 */
+ addi r19, r5, -32
+ addi r0, r11, -4
+ ld r8, 16(r4)
+ cmpb r10, r8, r12
+ cmpdi cr7, r10, 0
+ beq cr7, L(dWordUnroll)
+
+ addi r9, r9, 16 /* increment dst by 16 */
+ addi r4, r4, 16 /* increment src by 16 */
+ addi r5, r5, -16 /* decrement length 'n' by 16 */
+ addi r0, r11, -2 /* decrement loop counter */
+
+L(dWordUnrollOFF):
+ ld r10, 0(r4) /* load first dword */
+ li r8, 0 /* load mask */
+ cmpb r8, r10, r8
+ cmpdi cr7, r8, 0
+ bne cr7, L(byte_by_byte)
+ mtctr r0
+ li r7, 0
+ b L(CopyDword)
+
+ .p2align 4
+L(loadDWordandCompare):
+ ld r10, 0(r4)
+ cmpb r8, r10, r7
+ cmpdi cr7, r8, 0
+ bne cr7, L(byte_by_byte)
+
+L(CopyDword):
+ addi r9, r9, 8
+ std r10, -8(r9)
+ addi r4, r4, 8
+ addi r5, r5, -8
+ bdnz L(loadDWordandCompare)
+
+L(byte_by_byte):
+ cmpldi cr7, r5, 3
+ ble cr7, L(verifyByte)
+ srdi r10, r5, 2
+ mr r19, r9
+ mtctr r10
+ b L(firstByteUnroll)
+
+ .p2align 4
+L(bytes_unroll):
+ lbz r10, 1(r4) /* load byte from src */
+ cmpdi cr7, r10, 0 /* compare for NULL */
+ stb r10, 1(r19) /* store byte to dst */
+ beq cr7, L(updtDestComputeN2ndByte)
+
+ addi r4, r4, 4 /* advance src */
+
+ lbz r10, -2(r4) /* perform loop unrolling for byte r/w */
+ cmpdi cr7, r10, 0
+ stb r10, 2(r19)
+ beq cr7, L(updtDestComputeN3rdByte)
+
+ lbz r10, -1(r4) /* perform loop unrolling for byte r/w */
+ addi r19, r19, 4
+ cmpdi cr7, r10, 0
+ stb r10, -1(r19)
+ beq cr7, L(ComputeNByte)
+
+ bdz L(update0)
+
+L(firstByteUnroll):
+ lbz r10, 0(r4) /* perform loop unrolling for byte r/w */
+	cmpdi cr7, r10, 0
+ stb r10, 0(r19)
+ bne cr7, L(bytes_unroll)
+ addi r19, r19, 1
+
+L(ComputeNByte):
+	subf r9, r19, r9	/* compute number of bytes to fill */
+ add r8, r9, r5
+
+L(zeroFill):
+ cmpdi cr7, r8, 0 /* compare if length is zero */
+ beq cr7, L(update3return)
+
+ mr r3, r19 /* fill buffer with */
+ li r4, 0 /* zero fill buffer */
+ mr r5, r8 /* how many bytes to fill buffer with */
+ bl MEMSET /* call optimized memset */
+ nop
+
+L(update3return):
+#ifdef USE_AS_STPNCPY
+ addi r3, r19, -1 /* update return value */
+#endif
+
+L(hop2return):
+#ifndef USE_AS_STPNCPY
+ mr r3, r18 /* set return value */
+#endif
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -16(r1) /* restore callers save register, r18 */
+ ld r19, -8(r1) /* restore callers save register, r19 */
+	mtlr r0			/* restore the link register */
+ blr /* return */
+
+ .p2align 4
+L(update0):
+ mr r9, r19
+
+ .p2align 4
+L(verifyByte):
+ rldicl. r8, r5, 0, 62
+#ifdef USE_AS_STPNCPY
+ mr r3, r9
+#endif
+ beq cr0, L(hop2return)
+ mtctr r8
+ addi r4, r4, -1
+ mr r19, r9
+ b L(oneBYone)
+
+ .p2align 4
+L(proceed):
+ bdz L(done)
+
+L(oneBYone):
+ lbzu r10, 1(r4) /* copy byte */
+ addi r19, r19, 1
+ addi r8, r8, -1
+ cmpdi cr7, r10, 0
+ stb r10, -1(r19)
+ bne cr7, L(proceed)
+ b L(zeroFill)
+
+ .p2align 4
+L(done):
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+#ifdef USE_AS_STPNCPY
+ mr r3, r19 /* set the return value */
+#else
+ mr r3, r18 /* set the return value */
+#endif
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -16(r1) /* restore callers save register, r18 */
+ ld r19, -8(r1) /* restore callers save register, r19 */
+	mtlr r0			/* restore the link register */
+ blr /* return */
+
+L(update1):
+ mr r0, r11
+ mr r19, r5
+
+ .p2align 4
+L(leftDwords):
+ cmpdi cr7, r0, 0
+ mr r5, r19
+ bne cr7, L(dWordUnrollOFF)
+ b L(byte_by_byte)
+
+ .p2align 4
+L(updtDestComputeN2ndByte):
+ addi r19, r19, 2 /* update dst by 2 */
+ subf r9, r19, r9 /* compute distance covered */
+ add r8, r9, r5
+ b L(zeroFill)
+
+ .p2align 4
+L(updtDestComputeN3rdByte):
+ addi r19, r19, 3 /* update dst by 3 */
+ subf r9, r19, r9 /* compute distance covered */
+ add r8, r9, r5
+ b L(zeroFill)
+
+ .p2align 4
+L(HopBy24):
+ addi r9, r9, 24 /* increment dst by 24 */
+ addi r4, r4, 24 /* increment src by 24 */
+ addi r5, r5, -24 /* decrement length 'n' by 24 */
+ addi r0, r11, -3 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(update2):
+ mr r5, r19
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(HopBy40):
+ addi r9, r7, 40 /* increment dst by 40 */
+ addi r4, r6, 40 /* increment src by 40 */
+ addi r5, r5, -40 /* decrement length 'n' by 40 */
+ addi r0, r11, -5 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+L(update3):
+ mr r0, r11
+ b L(dWordUnrollOFF)
+
+L(HopBy8):
+ addi r9, r3, 8 /* increment dst by 8 */
+ addi r4, r4, 8 /* increment src by 8 */
+ addi r5, r5, -8 /* decrement length 'n' by 8 */
+ addi r0, r11, -1 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+L(unaligned):
+ cmpdi r5, 16 /* Proceed byte by byte for less than 16 */
+ ble L(byte_by_byte)
+ rldicl r7, r3, 0, 61
+ rldicl r6, r4, 0, 61
+ cmpdi r6, 0 /* Check src alignment */
+ beq L(srcaligndstunalign)
+ /* src is unaligned */
+ rlwinm r10, r4, 3,26,28 /* Calculate padding. */
+ clrrdi r4, r4, 3 /* Align the addr to dw boundary */
+ ld r8, 0(r4) /* Load doubleword from memory. */
+ li r0, 0
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ srd r7, r8, r10
+#else
+ sld r7, r8, r10
+#endif
+ cmpb r0, r7, r0 /* Compare each byte against null */
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ sld r0, r0, r10
+#else
+ srd r0, r0, r10
+#endif
+ cmpdi r0, 0
+ bne L(bytebybyte) /* if it has null, copy byte by byte */
+ subfic r6, r6, 8
+ rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */
+ rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */
+ addi r3, r3, -1
+
+ cmpdi r12, 0 /* check dest alignment */
+ beq L(srcunaligndstalign)
+
+ /* both src and dst unaligned */
+#ifdef __LITTLE_ENDIAN__
+ sld r8, r7, r10
+ mr r11, r10
+ addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
+#else
+ srd r8, r7, r10
+ subfic r11, r10, 64
+#endif
+	/* Is dst alignment greater than src alignment? */
+ cmpd cr7, r12, r10
+ ble cr7, L(dst_align_small)
+ /* src alignment is less than dst */
+
+ /* Calculate the dst alignment difference */
+ subfic r7, r9, 8
+ mtctr r7
+
+ /* Write until dst is aligned */
+ cmpdi r0, r7, 4
+ blt L(storebyte1) /* less than 4, store byte by byte */
+ beq L(equal1) /* if its 4, store word */
+ addi r0, r7, -4 /* greater than 4, so stb and stw */
+ mtctr r0
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte1)
+
+ subfic r7, r9, 8 /* Check the remaining bytes */
+ cmpdi r0, r7, 4
+ blt L(proceed1)
+
+ .align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+ srd r7, r8, r11
+#else
+ subfic r11, r11, 64
+ sld r7, r8, r11
+ srdi r7, r7, 32
+#endif
+ stw r7, 1(r3)
+ addi r3, r3, 4
+ addi r5, r5, -4
+
+L(proceed1):
+ mr r7, r8
+	/* calculate the leftover bytes to be written */
+ subfic r11, r10, 64
+ subfic r12, r12, 64
+ subf r12, r12, r11 /* remaining bytes on second dw */
+ subfic r10, r12, 64 /* remaining bytes on first dw */
+ subfic r9, r9, 8
+ subf r6, r9, r6 /* recalculate padding */
+L(srcunaligndstalign):
+ addi r3, r3, 1
+ subfic r12, r10, 64 /* remaining bytes on second dw */
+ addi r4, r4, 8
+ li r0,0
+ b L(storedouble)
+
+ .align 4
+L(dst_align_small):
+ mtctr r6
+ /* Write until src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte2)
+
+ addi r4, r4, 8 /* Increment src pointer */
+ addi r3, r3, 1 /* Increment dst pointer */
+ mr r9, r3
+ li r8, 0
+ cmpd cr7, r12, r10
+ beq cr7, L(aligned)
+ rldicl r6, r3, 0, 61 /* Recalculate padding */
+ mr r7, r6
+
+	/* src is aligned */
+L(srcaligndstunalign):
+ mr r9, r3
+ mr r6, r7
+ ld r8, 0(r4)
+ subfic r10, r7, 8
+ mr r7, r8
+ li r0, 0 /* Check null */
+ cmpb r0, r8, r0
+ cmpdi r0, 0
+ bne L(byte_by_byte) /* Do byte by byte if there is NULL */
+ rlwinm r12, r3, 3,26,28 /* Calculate padding */
+ addi r3, r3, -1
+ /* write byte by byte until aligned */
+#ifdef __LITTLE_ENDIAN__
+ li r11, -8
+#else
+ li r11, 64
+#endif
+ mtctr r10
+ cmpdi r0, r10, 4
+ blt L(storebyte)
+ beq L(equal)
+ addi r0, r10, -4
+ mtctr r0
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte)
+
+ cmpdi r0, r10, 4
+ blt L(align)
+
+ .align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8
+ srd r7, r8, r11
+#else
+ subfic r11, r11, 64
+ sld r7, r8, r11
+ srdi r7, r7, 32
+#endif
+ stw r7, 1(r3)
+ addi r5, r5, -4
+ addi r3, r3, 4
+L(align):
+ addi r3, r3, 1
+ addi r4, r4, 8 /* Increment src pointer */
+ subfic r10, r12, 64
+ li r0, 0
+ /* dst addr aligned to 8 */
+L(storedouble):
+ cmpdi r5, 8
+ ble L(null1)
+ ld r7, 0(r4) /* load next dw */
+ cmpb r0, r7, r0
+ cmpdi r0, 0 /* check for null on each new dw */
+ bne L(null)
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r8, r10 /* bytes from first dw */
+ sld r11, r7, r12 /* bytes from second dw */
+#else
+ sld r9, r8, r10
+ srd r11, r7, r12
+#endif
+ or r11, r9, r11 /* make as a single dw */
+ std r11, 0(r3) /* store as std on aligned addr */
+ mr r8, r7 /* still few bytes left to be written */
+ addi r3, r3, 8 /* increment dst addr */
+ addi r4, r4, 8 /* increment src addr */
+ addi r5, r5, -8
+ b L(storedouble) /* Loop until NULL */
+
+ .align 4
+
+/* We've hit the end of the string. Do the rest byte-by-byte. */
+L(null):
+ addi r3, r3, -1
+ mr r10, r12
+ mtctr r6
+#ifdef __LITTLE_ENDIAN__
+ subfic r10, r10, 64
+ addi r10, r10, -8
+#endif
+ cmpdi r0, r5, 4
+ blt L(loop)
+ cmpdi r0, r6, 4
+ blt L(loop)
+
+ /* we can still use stw if leftover >= 4 */
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+ srd r11, r8, r10
+#else
+ subfic r10, r10, 64
+ sld r11, r8, r10
+ srdi r11, r11, 32
+#endif
+ stw r11, 1(r3)
+ addi r5, r5, -4
+ addi r3, r3, 4
+ cmpdi r0, r5, 0
+ beq L(g1)
+ cmpdi r0, r6, 4
+ beq L(bytebybyte1)
+ addi r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, -8
+#else
+ subfic r10, r10, 64
+#endif
+ addi r0, r6, -4
+ mtctr r0
+ /* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+#else
+ addi r10, r10, -8
+#endif
+ srd r0, r8, r10
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ cmpdi r0, r5, 0
+ beq L(g1)
+ bdnz L(loop)
+L(bytebybyte1):
+ addi r3, r3, 1
+ /* remaining byte by byte part of second dw */
+L(bytebybyte):
+ addi r3, r3, -8
+ addi r4, r4, -1
+
+#ifdef __LITTLE_ENDIAN__
+ extrdi. r0, r7, 8, 56
+ stbu r7, 8(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 48
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 40
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 32
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 24
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 16
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 8
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi r0, r7, 8, 0
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ b L(g2)
+#else
+ extrdi. r0, r7, 8, 0
+ stbu r0, 8(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 8
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 16
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 24
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 32
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 40
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 48
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ b L(g2)
+#endif
+L(g1):
+#ifdef USE_AS_STPNCPY
+ addi r3, r3, 1
+#endif
+L(g2):
+ addi r3, r3, 1
+ mr r19, r3
+ mr r8, r5
+ b L(zeroFill)
+L(null1):
+ mr r9, r3
+ subf r4, r6, r4
+ b L(byte_by_byte)
+END(FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
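
The control flow above is intricate, but the contract it implements is
exactly the strncpy/stpncpy semantics described in the header comment:
copy at most n bytes, stop at the terminator, then pad the remainder
through one call to an optimized memset.  A portable C outline of that
contract (sketch only; the assembly's value lies in the aligned
doubleword paths):

    #include <string.h>

    static char *
    strncpy_sketch (char *dst, const char *src, size_t n)
    {
      size_t i = 0;
      while (i < n && src[i] != '\0')  /* the dword/byte copy loops */
        {
          dst[i] = src[i];
          i++;
        }
      memset (dst + i, 0, n - i);      /* the L(zeroFill) call to MEMSET */
      return dst;                      /* stpncpy returns dst + i instead */
    }
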
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S
new file mode 100644
index 0000000000..a970b6ce30
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S
@@ -0,0 +1,182 @@
+/* Optimized strnlen implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef STRNLEN
+# define STRNLEN __strnlen
+#endif
+
+/* size_t [r3] strnlen (const char *s [r3], size_t size [r4])  */
+ .machine power7
+ENTRY (STRNLEN)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3
+ add r7,r3,r4 /* Calculate the last acceptable address. */
+ cmpldi r4,32
+ li r0,0 /* Doubleword with null chars. */
+ addi r7,r7,-1
+
+ /* If we have less than 33 bytes to search, skip to a faster code. */
+ ble L(small_range)
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ ld r12,0(r8) /* Load doubleword from memory. */
+ cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ sld r10,r10,r6
+#else
+ sld r10,r10,r6
+ srd r10,r10,r6
+#endif
+ cmpldi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+ bne cr7,L(done)
+
+ clrrdi r7,r7,3 /* Address of last doubleword. */
+ mtcrf 0x01,r8
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop_setup)
+
+ /* Handle DWORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r0
+ cmpldi cr7,r10,0
+ bne cr7,L(done)
+
+L(loop_setup):
+ /* The last dword we want to read in the loop below is the one
+      containing the last byte of the string, i.e. the dword at
+ (s + size - 1) & ~7, or r7. The first dword read is at
+ r8 + 8, we read 2 * cnt dwords, so the last dword read will
+ be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives
+ cnt = (r7 - r8) / 16 */
+ sub r5,r7,r8
+ srdi r6,r5,4 /* Number of loop iterations. */
+ mtctr r6 /* Setup the counter. */
+
+ /* Main loop to look for the null byte in the string. Since
+ it's a small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r10,r12,r0
+ cmpb r9,r11,r0
+ or r5,r9,r10 /* Merge everything in one doubleword. */
+ cmpldi cr7,r5,0
+ bne cr7,L(found)
+ bdnz L(loop)
+
+ /* We may have one more dword to read. */
+ cmpld cr6,r8,r7
+ beq cr6,L(end_max)
+
+ ldu r12,8(r8)
+ cmpb r10,r12,r0
+ cmpldi cr6,r10,0
+ bne cr6,L(done)
+
+L(end_max):
+ mr r3,r4
+ blr
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+ .align 4
+L(found):
+ cmpldi cr6,r10,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r9
+ addi r8,r8,8
+
+ /* r10 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the length.
+ We need to make sure the null char is *before* the end of the
+ range. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r10,-1
+ andc r0,r0,r10
+ popcntd r0,r0
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ sub r3,r8,r3
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r3,r0 /* Length until the match. */
+ cmpld r3,r4
+ blelr
+ mr r3,r4
+ blr
+
+/* Deals with size <= 32. */
+ .align 4
+L(small_range):
+ cmpldi r4,0
+ beq L(end_max)
+
+ clrrdi r7,r7,3 /* Address of last doubleword. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ ld r12,0(r8) /* Load doubleword from memory. */
+ cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ sld r10,r10,r6
+#else
+ sld r10,r10,r6
+ srd r10,r10,r6
+#endif
+ cmpldi cr7,r10,0
+ bne cr7,L(done)
+
+ cmpld r8,r7
+ beq L(end_max)
+
+ .p2align 5
+L(loop_small):
+ ldu r12,8(r8)
+ cmpb r10,r12,r0
+ cmpldi cr6,r10,0
+ bne cr6,L(done)
+ cmpld r8,r7
+ bne L(loop_small)
+ mr r3,r4
+ blr
+
+END (STRNLEN)
+libc_hidden_def (__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
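
The L(loop_setup) comment above derives the loop trip count: the loop
reads doubleword pairs starting at r8 + 8, so after cnt iterations the
last doubleword read sits at r8 + 16 * cnt; choosing
cnt = (r7 - r8) / 16 (the srdi by 4) guarantees the loop never reads
past the doubleword containing s + size - 1, and the single ldu after
the loop covers an odd trailing doubleword.  The same arithmetic in C,
with hypothetical names:

    #include <stdint.h>

    /* first_dw: address of the doubleword already checked (r8);
       last_dw: address of the doubleword holding byte s + size - 1 (r7).
       Both are 8-byte aligned.  */
    static uint64_t
    strnlen_loop_count (uintptr_t first_dw, uintptr_t last_dw)
    {
      uint64_t cnt = (last_dw - first_dw) / 16;  /* srdi r6,r5,4 */
      /* The last pair read is at first_dw + 16 * cnt <= last_dw, so the
         loop stays inside [s, s + size).  */
      return cnt;
    }
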
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S
new file mode 100644
index 0000000000..c22393deb5
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S
@@ -0,0 +1,260 @@
+/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char * [r3] strrchr (const char *s [r3], int c [r4])  */
+
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+ .machine power7
+ENTRY (STRRCHR)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+	li	r9,0	   /* used to store the last occurrence */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+	/* r4 has been replicated; if c's low byte was 0 while c itself
+	   was nonzero, r4 is now 0, so check for null again.  */
+ cmpdi cr7,r4,0
+ beq cr7,L(null_match)
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+L(align):
+ mtcrf 0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+ bt 28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r7,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r7,r4
+ cmpb r7,r7,r0
+ or r12,r10,r11
+ or r5,r6,r7
+ or r5,r12,r5
+ cmpdi cr7,r5,0
+ beq cr7,L(loop)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+
+L(done):
+	/* If r11 contains one or more 0xff bytes (null matches), find
+	   the position of the first one and discard the c matches in
+	   r10 from that position onward.  */
+ cmpdi cr7,r11,0
+ beq cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r11,-1
+ andc r3,r3,r11
+ popcntd r0,r3
+#else
+ cntlzd r0,r11
+#endif
+ subfic r0,r0,63
+ li r6,-1
+#ifdef __LITTLE_ENDIAN__
+ srd r0,r6,r0
+#else
+ sld r0,r6,r0
+#endif
+ and r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ addi r3,r10,-1
+ andc r3,r3,r10
+ addi r10,r11,-1
+ andc r10,r10,r11
+ cmpld cr7,r3,r10
+ bgt cr7,L(no_match)
+#else
+ addi r3,r10,-1 /* Count trailing zeros before c matches. */
+ andc r3,r3,r10
+ popcntd r0,r3
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+ subfic r0,r0,7
+ add r9,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ li r0,0
+ cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
+ beq cr7,L(align)
+
+ .align 4
+L(no_match):
+ mr r3,r9
+ blr
+
+/* We are here because strrchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0      /* If r5 == 0, no null bytes
+				 have been found.  */
+ bne cr7,L(done_null)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop_null)
+
+	/* Handle DWORD2 of pair.  */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(loop_null)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+END (STRRCHR)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
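
The L(done) filter above is what lets a forward scan return the last
occurrence: inside the doubleword that holds the terminator, any c
matches past the first null byte must be discarded before the highest
surviving match is taken.  A little-endian C model of the filter, with
a hypothetical helper name; cmask and zmask are the cmpb results
against c and against 0:

    #include <stdint.h>

    static uint64_t
    drop_matches_past_null_le (uint64_t cmask, uint64_t zmask)
    {
      if (zmask == 0)
        return cmask;                       /* no terminator here */
      uint64_t first_null = zmask & -zmask; /* lowest bit of first null */
      return cmask & (first_null - 1);      /* keep matches below it */
    }
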
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c
new file mode 100644
index 0000000000..a917b2157e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c
@@ -0,0 +1,27 @@
+/* Optimized strstr implementation for PowerPC64/POWER7.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STRSTR __strstr_ppc
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(__name)
+
+extern __typeof (strstr) __strstr_ppc attribute_hidden;
+
+#include <string/strstr.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S
new file mode 100644
index 0000000000..260db2ed6d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S
@@ -0,0 +1,521 @@
+/* Optimized strstr implementation for PowerPC64/POWER7.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char * [r3] strstr (const char *s [r3], const char *pat [r4])  */
+
+/* The performance gain comes from aligned memory accesses, doubleword
+   loads, and use of the cmpb instruction for quicker comparisons.  */
+
+#define ITERATIONS 64
+
+#ifndef STRSTR
+# define STRSTR strstr
+#endif
+
+#ifndef STRLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRLEN __GI_strlen
+# else
+# define STRLEN strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRNLEN __GI_strnlen
+# else
+# define STRNLEN __strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+# define STRCHR __GI_strchr
+# else
+# define STRCHR strchr
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+ .machine power7
+EALIGN (STRSTR, 4, 0)
+ CALL_MCOUNT 2
+ mflr r0 /* Load link register LR to r0. */
+ std r31, -8(r1) /* Save callers register r31. */
+ std r30, -16(r1) /* Save callers register r30. */
+ std r29, -24(r1) /* Save callers register r29. */
+ std r28, -32(r1) /* Save callers register r28. */
+ std r0, 16(r1) /* Store the link register. */
+ cfi_offset(r31, -8)
+ cfi_offset(r30, -16)
+ cfi_offset(r28, -32)
+ cfi_offset(r29, -24)
+ cfi_offset(lr, 16)
+ stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ dcbt 0, r3
+ dcbt 0, r4
+ cmpdi cr7, r3, 0
+ beq cr7, L(retnull)
+ cmpdi cr7, r4, 0
+ beq cr7, L(retnull)
+
+ mr r29, r3
+ mr r30, r4
+ mr r3, r4
+ bl STRLEN
+ nop
+
+	cmpdi	cr7, r3, 0	/* If the search string is empty.  */
+ beq cr7, L(ret_r3)
+
+ mr r31, r3
+ mr r4, r3
+ mr r3, r29
+ bl STRNLEN
+ nop
+
+	cmpd	cr7, r3, r31	/* If len(s) < len(search str).  */
+ blt cr7, L(retnull)
+ mr r3, r29
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+
+ mr r11, r3
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+ /* Reg r28 is used to count the number of iterations. */
+ li r28, 0
+ rldicl r8, r3, 0, 52 /* Page cross check. */
+ cmpldi cr7, r8, 4096-16
+ bgt cr7, L(bytebybyte)
+
+ rldicl r8, r30, 0, 52
+ cmpldi cr7, r8, 4096-16
+ bgt cr7, L(bytebybyte)
+
+ /* If len(r4) < 8 handle in a different way. */
+ /* Shift position based on null and use cmpb. */
+ cmpdi cr7, r31, 8
+ blt cr7, L(lessthan8)
+
+ /* Len(r4) >= 8 reaches here. */
+ mr r8, r3 /* Save r3 for future use. */
+ mr r4, r30 /* Restore r4. */
+ li r0, 0
+ rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
+ clrrdi r4, r4, 3 /* Make r4 aligned to 8. */
+ ld r6, 0(r4)
+ addi r4, r4, 8
+	cmpdi	cr7, r10, 0	/* Is it already aligned?  */
+ beq cr7, L(begin1)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+#else
+ sld r6, r6, r10
+#endif
+ ld r9, 0(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r10 /* Discard unwanted bits. */
+#else
+ srd r9, r9, r10
+#endif
+ or r6, r6, r9 /* Form complete search str. */
+L(begin1):
+ mr r29, r6
+ rlwinm r10, r3, 3, 26, 28
+ clrrdi r3, r3, 3
+ ld r5, 0(r3)
+ cmpb r9, r0, r6 /* Check if input has null. */
+ cmpdi cr7, r9, 0
+ bne cr7, L(return3)
+ cmpb r9, r0, r5 /* Check if input has null. */
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r9, r10
+#else
+ sld r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+
+ li r12, -8 /* Shift values. */
+ li r11, 72 /* Shift values. */
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextbyte1)
+ mr r12, r10
+ addi r12, r12, -8
+ subfic r11, r12, 64
+
+L(nextbyte1):
+ ldu r7, 8(r3) /* Load next dw. */
+ addi r12, r12, 8 /* Shift one byte and compare. */
+ addi r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12 /* Rotate based on mask. */
+ sld r10, r7, r11
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+#endif
+ /* Form single dw from few bytes on first load and second load. */
+ or r10, r9, r10
+ /* Check for null in the formed dw. */
+ cmpb r9, r0, r10
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ /* Cmpb search str and input str. */
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ beq cr7, L(match)
+ addi r8, r8, 1
+ b L(begin)
+
+ .align 4
+L(match):
+ /* There is a match of 8 bytes, check next bytes. */
+ cmpdi cr7, r31, 8
+ beq cr7, L(return)
+ /* Update next starting point r8. */
+ srdi r9, r11, 3
+ subf r9, r9, r3
+ mr r8, r9
+
+L(secondmatch):
+ mr r5, r7
+ rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
+ ld r6, 0(r4)
+ addi r4, r4, 8
+	cmpdi	cr7, r10, 0	/* Is it already aligned?  */
+ beq cr7, L(proceed3)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+ cmpb r9, r0, r6
+ sld r9, r9, r10
+#else
+ sld r6, r6, r10
+ cmpb r9, r0, r6
+ srd r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(proceed3)
+ ld r9, 0(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r10 /* Discard unwanted bits. */
+#else
+ srd r9, r9, r10
+#endif
+ or r6, r6, r9 /* Form complete search str. */
+
+L(proceed3):
+ li r7, 0
+ addi r3, r3, 8
+ cmpb r9, r0, r5
+ cmpdi cr7, r9, 0
+ bne cr7, L(proceed4)
+ ld r7, 0(r3)
+L(proceed4):
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12
+ sld r10, r7, r11
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+#endif
+ /* Form single dw with few bytes from first and second load. */
+ or r10, r9, r10
+ cmpb r9, r0, r6
+ cmpdi cr7, r9, 0
+ bne cr7, L(return4)
+ /* Check for null in the formed dw. */
+ cmpb r9, r0, r10
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+	/* If the next 8 bytes don't match, start the search again.  */
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ bne cr7, L(reset)
+ /* If the next 8 bytes match, load and compare next 8. */
+ b L(secondmatch)
+
+ .align 4
+L(reset):
+ /* Start the search again. */
+ addi r8, r8, 1
+ b L(begin)
+
+ .align 4
+L(return3):
+ /* Count leading zeros and compare partial dw. */
+#ifdef __LITTLE_ENDIAN__
+ addi r7, r9, -1
+ andc r7, r7, r9
+ popcntd r7, r7
+ subfic r7, r7, 64
+ sld r10, r5, r7
+ sld r6, r6, r7
+#else
+ cntlzd r7, r9
+ subfic r7, r7, 64
+ srd r10, r5, r7
+ srd r6, r6, r7
+#endif
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ addi r8, r8, 1
+ /* Start search again if there is no match. */
+ bne cr7, L(begin)
+ /* If the words match, update return values. */
+ subfic r7, r7, 64
+ srdi r7, r7, 3
+ add r3, r3, r7
+ subf r3, r31, r3
+ b L(end)
+
+ .align 4
+L(return4):
+ /* Count leading zeros and compare partial dw. */
+#ifdef __LITTLE_ENDIAN__
+ addi r7, r9, -1
+ andc r7, r7, r9
+ popcntd r7, r7
+ subfic r7, r7, 64
+ sld r10, r10, r7
+ sld r6, r6, r7
+#else
+ cntlzd r7, r9
+ subfic r7, r7, 64
+ srd r10, r10, r7
+ srd r6, r6, r7
+#endif
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ addi r8, r8, 1
+ bne cr7, L(begin)
+ subfic r7, r7, 64
+ srdi r11, r11, 3
+ subf r3, r11, r3
+ srdi r7, r7, 3
+ add r3, r3, r7
+ subf r3, r31, r3
+ b L(end)
+
+ .align 4
+L(begin):
+ mr r3, r8
+	/* When our iterations exceed ITERATIONS, fall back to default.  */
+ addi r28, r28, 1
+ cmpdi cr7, r28, ITERATIONS
+ beq cr7, L(default)
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+ mr r8, r3
+ mr r4, r30 /* Restore r4. */
+ li r0, 0
+ mr r6, r29
+ clrrdi r4, r4, 3
+ addi r4, r4, 8
+ b L(begin1)
+
+ /* Handle less than 8 search string. */
+ .align 4
+L(lessthan8):
+ mr r4, r3
+ mr r9, r30
+ li r0, 0
+
+ rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */
+ srdi r8, r10, 3 /* Padding in bytes. */
+	clrrdi	r9, r9, 3	/* Make r9 aligned to 8.  */
+ ld r6, 0(r9)
+	cmpdi	cr7, r10, 0	/* Is it already aligned?  */
+ beq cr7, L(proceed2)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+#else
+ sld r6, r6, r10
+#endif
+ subfic r8, r8, 8
+ cmpd cr7, r8, r31 /* Next load needed? */
+ bge cr7, L(proceed2)
+ ld r7, 8(r9)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r7, r7, r10 /* Discard unwanted bits. */
+#else
+ srd r7, r7, r10
+#endif
+ or r6, r6, r7 /* Form complete search str. */
+L(proceed2):
+ mr r29, r6
+ rlwinm r10, r3, 3, 26, 28
+ clrrdi r7, r3, 3 /* Make r3 aligned. */
+ ld r5, 0(r7)
+ sldi r8, r31, 3
+ subfic r8, r8, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r6, r6, r8
+ cmpb r9, r0, r5
+ srd r9, r9, r10
+#else
+ srd r6, r6, r8
+ cmpb r9, r0, r5
+ sld r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(noload)
+ cmpdi cr7, r10, 0
+ beq cr7, L(continue)
+ ld r7, 8(r7)
+L(continue1):
+ mr r12, r10
+ addi r12, r12, -8
+ subfic r11, r12, 64
+ b L(nextbyte)
+
+ .align 4
+L(continue):
+ ld r7, 8(r7)
+ li r12, -8 /* Shift values. */
+ li r11, 72 /* Shift values. */
+L(nextbyte):
+ addi r12, r12, 8 /* Mask for rotation. */
+ addi r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12
+ sld r10, r7, r11
+ or r10, r9, r10
+ sld r10, r10, r8
+ cmpb r9, r0, r10
+ srd r9, r9, r8
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+ or r10, r9, r10
+ srd r10, r10, r8
+ cmpb r9, r0, r10
+ sld r9, r9, r8
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ beq cr7, L(end)
+ addi r3, r4, 1
+	/* When our iterations exceed ITERATIONS, fall back to default.  */
+ addi r28, r28, 1
+ cmpdi cr7, r28, ITERATIONS
+ beq cr7, L(default)
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+ mr r4, r3
+ mr r6, r29
+ li r0, 0
+ b L(proceed2)
+
+ .align 4
+L(noload):
+ /* Reached null in r3, so skip next load. */
+ li r7, 0
+ b L(continue1)
+
+ .align 4
+L(return):
+ /* Update return values. */
+ srdi r9, r11, 3
+ subf r3, r9, r3
+ b L(end)
+
+ /* Handling byte by byte. */
+ .align 4
+L(bytebybyte):
+ mr r8, r3
+ addi r8, r8, -1
+L(loop1):
+ addi r8, r8, 1
+ mr r3, r8
+ mr r4, r30
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(updater3)
+L(loop):
+ lbz r5, 0(r3)
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ cmpld cr7, r6, r5
+ bne cr7, L(loop1)
+ addi r3, r3, 1
+ addi r4, r4, 1
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(updater3)
+ b L(loop)
+
+ /* Handling return values. */
+ .align 4
+L(updater3):
+ subf r3, r31, r3 /* Reduce len of r4 from r3. */
+ b L(end)
+
+ .align 4
+L(ret_r3):
+ mr r3, r29 /* Return r3. */
+ b L(end)
+
+ .align 4
+L(retnull):
+ li r3, 0 /* Return NULL. */
+ b L(end)
+
+ .align 4
+L(default):
+ mr r4, r30
+ bl __strstr_ppc
+ nop
+
+ .align 4
+L(end):
+ addi r1, r1, FRAMESIZE /* Restore stack pointer. */
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ ld r0, 16(r1) /* Restore the saved link register. */
+ ld r28, -32(r1) /* Restore callers save register r28. */
+ ld r29, -24(r1) /* Restore callers save register r29. */
+ ld r30, -16(r1) /* Restore callers save register r30. */
+ ld r31, -8(r1) /* Restore callers save register r31. */
+	mtlr	r0		/* Restore the link register.  */
+ blr
+END (STRSTR)
+libc_hidden_builtin_def (strstr)
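
At a high level the code above anchors on the first byte of the needle
with strchr, compares the needle a doubleword at a time with cmpb, and
after ITERATIONS (64) failed anchors bails out to the generic
__strstr_ppc fallback so a pathological input cannot stay on the
quadratic path.  Hedged C pseudocode of that shape; the real code also
pre-checks strnlen (s) against the needle length and rules out
page-crossing loads:

    #include <stddef.h>
    #include <string.h>

    extern char *__strstr_ppc (const char *, const char *); /* fallback */

    static char *
    strstr_shape (const char *haystack, const char *needle)
    {
      size_t nlen = strlen (needle);
      if (nlen == 0)
        return (char *) haystack;
      for (int iter = 0; iter < 64; iter++)  /* ITERATIONS */
        {
          haystack = strchr (haystack, needle[0]);
          if (haystack == NULL)
            return NULL;
          if (strncmp (haystack, needle, nlen) == 0) /* dword cmpb loop */
            return (char *) haystack;
          haystack++;                /* addi r8,r8,1; b L(begin) */
        }
      return __strstr_ppc (haystack, needle);  /* give up, generic code */
    }
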
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S
new file mode 100644
index 0000000000..848dad5718
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S
@@ -0,0 +1,23 @@
+/* PowerPC64 mpn_sub_n -- mpn subtraction, implemented by including
+   add_n.S with USE_AS_SUB defined.
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define USE_AS_SUB
+#include "add_n.S"