Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc64/power8')
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies | 2
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile | 3
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S | 303
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S | 508
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S | 56
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S | 61
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S | 56
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S | 45
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S | 48
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S | 519
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S | 1447
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S | 458
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies | 1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S | 24
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S | 24
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S | 457
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c | 29
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S | 538
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S | 377
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S | 23
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S | 247
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S | 270
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S | 20
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S | 301
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S | 20
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S | 327
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S | 465
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S | 433
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S | 464
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S | 202
35 files changed, 7733 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies
new file mode 100644
index 0000000000..9a5e3c7277
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power7/fpu
+powerpc/powerpc64/power7
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile
new file mode 100644
index 0000000000..71a59529f3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += strcasestr-ppc64
+endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies
new file mode 100644
index 0000000000..1187cdfb0a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power7/fpu/
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
new file mode 100644
index 0000000000..4c42926a74
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
@@ -0,0 +1,303 @@
+/* Optimized expf(). PowerPC64/POWER8 version.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ * Let K = 64 (table size).
+ * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ * where:
+ * x = m*log(2)/K + y, y in [0.0..log(2)/K]
+ * m = n*K + j, where m, n and j are signed integers, j in [0..K-1]
+ * values of 2^(j/K) are tabulated as T[j].
+ *
+ * P(y) is a minimax polynomial approximation of expf(y)-1
+ * on small interval [0.0..log(2)/K].
+ *
+ * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ * expf(NaN) = NaN
+ * expf(+INF) = +INF
+ * expf(-INF) = 0
+ * expf(x) = 1 for subnormals
+ * for finite arguments, only expf(0) = 1 is exact
+ * expf(x) overflows if x>88.7228317260742190
+ * expf(x) underflows if x<-103.972076416015620
+ */
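
As a reading aid, the algorithm described above can be sketched in C.  This
is an illustrative model only: T[] and P0..P3 refer to the tables at the end
of this file, nearbyint() and ldexp() stand in for the magic-constant
rounding and exponent-bias bit manipulation the assembly actually performs,
and all special cases are omitted.

    #include <math.h>

    #define K 64
    extern const double T[K];            /* T[j] = 2^(j/K)  (.Ttable below) */
    extern const double P0, P1, P2, P3;  /* polynomial coefficients below */

    static float
    expf_sketch (float x)
    {
      double m = nearbyint ((double) x * (K / M_LN2));  /* m = n*K + j */
      int    j = (int) m & (K - 1);
      int    n = (int) m >> 6;                          /* n = m / K */
      double y = (double) x - m * (M_LN2 / K);
      double z = y * y;
      double p = (P3 * z + P1) * z + (P2 * z + P0) * y; /* P(y) */
      return (float) ldexp (T[j] * (1.0 + p), n);
    }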
+
+#define C1 0x42ad496b /* Single precision 125*log(2). */
+#define C2 0x31800000 /* Single precision 2^(-28). */
+#define SP_INF 0x7f800000 /* Single precision Inf. */
+#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias. */
+
+#define DATA_OFFSET r9
+
+/* Implements the function
+
+ float [fp1] expf (float [fp1] x) */
+
+ .machine power8
+EALIGN(__ieee754_expf, 4, 0)
+ addis DATA_OFFSET,r2,.Lanchor@toc@ha
+ addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l
+
+ xscvdpspn v0,v1
+ mfvsrd r8,v0 /* r8 = x */
+ lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET)
+ lfd fp3,(.P2-.Lanchor)(DATA_OFFSET)
+ rldicl r3,r8,32,33 /* r3 = |x| */
+ lis r4,C1@ha /* r4 = 125*log(2) */
+ ori r4,r4,C1@l
+ cmpw r3,r4
+ lfd fp5,(.P3-.Lanchor)(DATA_OFFSET)
+ lfd fp4,(.RS-.Lanchor)(DATA_OFFSET)
+ fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */
+ bge L(special_paths) /* |x| >= 125*log(2) ? */
+
+ lis r4,C2@ha
+ ori r4,r4,C2@l
+ cmpw r3,r4
+ blt L(small_args) /* |x| < 2^(-28) ? */
+
+ /* Main path: here if 2^(-28) <= |x| < 125*log(2) */
+ frsp fp6,fp2
+ xscvdpsp v2,v2
+ mfvsrd r8,v2
+ mr r3,r8 /* r3 = m */
+ rldicl r8,r8,32,58 /* r8 = j */
+ lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
+ fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */
+ srdi r3,r3,32
+ clrrwi r3,r3,6 /* r3 = n */
+ lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
+ fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
+ fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
+ lfd fp4,(.P1-.Lanchor)(DATA_OFFSET)
+ lfd fp6,(.P0-.Lanchor)(DATA_OFFSET)
+ lis r4,SP_EXP_BIAS@ha
+ ori r4,r4,SP_EXP_BIAS@l
+ add r3,r3,r4
+ rldic r3,r3,49,1 /* r3 = 2^n */
+ fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
+ fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
+ mtvsrd v1,r3
+ xscvspdp v1,v1
+ fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */
+ fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
+ sldi r8,r8,3 /* Access doublewords from T[j]. */
+ addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
+ lfdx fp3,r6,r8
+ fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */
+ fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */
+ frsp fp1,fp1
+ blr
+
+ .align 4
+/* Here expf(x) underflows or overflows, or x is Inf or NaN. */
+L(special_paths):
+ srdi r8,r8,32
+ rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive.
+ r8 = 4, otherwise. */
+ addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor)
+ lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */
+ cmpw r3,r4
+ /* |x| <= .SPRANGE[signbit(x)] */
+ ble L(near_under_or_overflow)
+
+ lis r4,SP_INF@ha
+ ori r4,r4,SP_INF@l
+ cmpw r3,r4
+	bge	L(arg_inf_or_nan)	/* |x| >= Inf ? */
+
+ addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor)
+ lfsx fp1,r6,r8
+ fmuls fp1,fp1,fp1
+ blr
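
The lfsx/fmuls pair above is a common libm idiom: once |x| is beyond the
representable range the result's exact value no longer matters, so squaring
a huge or tiny constant (2^100 or 2^-100 from .SPLARGE_SMALL) produces the
correctly signed result and raises the required overflow/underflow and
inexact exceptions.  A minimal C sketch of the same idea:

    /* x > 0 overflows to +Inf, x < 0 underflows to +0, each raising
       the appropriate exception through the multiply.  */
    static float
    expf_force_over_underflow (float x)
    {
      float c = (x > 0.0f) ? 0x1p100f : 0x1p-100f;
      return c * c;
    }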
+
+
+ .align 4
+L(small_args):
+	/* expf(x) = 1.0, where |x| < 2^(-28) */
+ lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET)
+ fadds fp1,fp1,fp2
+ blr
+
+
+ .align 4
+L(arg_inf_or_nan):
+ bne L(arg_nan)
+
+ /* expf(+INF) = +INF
+ expf(-INF) = 0 */
+ addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor)
+ lfsx fp1,r6,r8
+ blr
+
+
+ .align 4
+L(arg_nan):
+ /* expf(NaN) = NaN */
+ fadd fp1,fp1,fp1
+ frsp fp1,fp1
+ blr
+
+ .align 4
+L(near_under_or_overflow):
+ frsp fp6,fp2
+ xscvdpsp v2,v2
+ mfvsrd r8,v2
+ mr r3,r8 /* r3 = m */
+ rldicl r8,r8,32,58 /* r8 = j */
+ lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
+ fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */
+ srdi r3,r3,32
+ clrrwi r3,r3,6 /* r3 = n */
+ lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
+ fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
+ fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
+ lfd fp4,(.P1-.Lanchor)(DATA_OFFSET)
+ lfd fp6,(.P0-.Lanchor)(DATA_OFFSET)
+ ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET)
+ add r3,r3,r4
+	rldic	r3,r3,46,1	/* r3 = 2^n */
+ fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
+ fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
+ mtvsrd v1,r3
+ fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */
+ fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
+ sldi r8,r8,3 /* Access doublewords from T[j]. */
+ addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
+ lfdx fp3,r6,r8
+	fmadd	fp0,fp0,fp3,fp3	/* fp0 = T[j] * (1 + P(y)) */
+	fmul	fp1,fp1,fp0	/* fp1 = 2^n * T[j] * (1 + P(y)) */
+ frsp fp1,fp1
+ blr
+END(__ieee754_expf)
+
+ .section .rodata, "a",@progbits
+.Lanchor:
+ .balign 8
+/* Table T[j] = 2^(j/K). Double precision. */
+.Ttable:
+ .8byte 0x3ff0000000000000
+ .8byte 0x3ff02c9a3e778061
+ .8byte 0x3ff059b0d3158574
+ .8byte 0x3ff0874518759bc8
+ .8byte 0x3ff0b5586cf9890f
+ .8byte 0x3ff0e3ec32d3d1a2
+ .8byte 0x3ff11301d0125b51
+ .8byte 0x3ff1429aaea92de0
+ .8byte 0x3ff172b83c7d517b
+ .8byte 0x3ff1a35beb6fcb75
+ .8byte 0x3ff1d4873168b9aa
+ .8byte 0x3ff2063b88628cd6
+ .8byte 0x3ff2387a6e756238
+ .8byte 0x3ff26b4565e27cdd
+ .8byte 0x3ff29e9df51fdee1
+ .8byte 0x3ff2d285a6e4030b
+ .8byte 0x3ff306fe0a31b715
+ .8byte 0x3ff33c08b26416ff
+ .8byte 0x3ff371a7373aa9cb
+ .8byte 0x3ff3a7db34e59ff7
+ .8byte 0x3ff3dea64c123422
+ .8byte 0x3ff4160a21f72e2a
+ .8byte 0x3ff44e086061892d
+ .8byte 0x3ff486a2b5c13cd0
+ .8byte 0x3ff4bfdad5362a27
+ .8byte 0x3ff4f9b2769d2ca7
+ .8byte 0x3ff5342b569d4f82
+ .8byte 0x3ff56f4736b527da
+ .8byte 0x3ff5ab07dd485429
+ .8byte 0x3ff5e76f15ad2148
+ .8byte 0x3ff6247eb03a5585
+ .8byte 0x3ff6623882552225
+ .8byte 0x3ff6a09e667f3bcd
+ .8byte 0x3ff6dfb23c651a2f
+ .8byte 0x3ff71f75e8ec5f74
+ .8byte 0x3ff75feb564267c9
+ .8byte 0x3ff7a11473eb0187
+ .8byte 0x3ff7e2f336cf4e62
+ .8byte 0x3ff82589994cce13
+ .8byte 0x3ff868d99b4492ed
+ .8byte 0x3ff8ace5422aa0db
+ .8byte 0x3ff8f1ae99157736
+ .8byte 0x3ff93737b0cdc5e5
+ .8byte 0x3ff97d829fde4e50
+ .8byte 0x3ff9c49182a3f090
+ .8byte 0x3ffa0c667b5de565
+ .8byte 0x3ffa5503b23e255d
+ .8byte 0x3ffa9e6b5579fdbf
+ .8byte 0x3ffae89f995ad3ad
+ .8byte 0x3ffb33a2b84f15fb
+ .8byte 0x3ffb7f76f2fb5e47
+ .8byte 0x3ffbcc1e904bc1d2
+ .8byte 0x3ffc199bdd85529c
+ .8byte 0x3ffc67f12e57d14b
+ .8byte 0x3ffcb720dcef9069
+ .8byte 0x3ffd072d4a07897c
+ .8byte 0x3ffd5818dcfba487
+ .8byte 0x3ffda9e603db3285
+ .8byte 0x3ffdfc97337b9b5f
+ .8byte 0x3ffe502ee78b3ff6
+ .8byte 0x3ffea4afa2a490da
+ .8byte 0x3ffefa1bee615a27
+ .8byte 0x3fff50765b6e4540
+ .8byte 0x3fffa7c1819e90d8
+
+.KLN2:
+ .8byte 0x40571547652b82fe /* Double precision K/log(2). */
+
+/* Double precision polynomial coefficients. */
+.P0:
+ .8byte 0x3fefffffffffe7c6
+.P1:
+ .8byte 0x3fe00000008d6118
+.P2:
+ .8byte 0x3fc55550da752d4f
+.P3:
+ .8byte 0x3fa56420eb78fa85
+
+.RS:
+ .8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */
+.NLN2K:
+ .8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */
+.DP_EXP_BIAS:
+ .8byte 0x000000000000ffc0 /* Double precision exponent bias. */
+
+ .balign 4
+.SPone:
+ .4byte 0x3f800000 /* Single precision 1.0. */
+.SP_RS:
+ .4byte 0x4b400000 /* Single precision 2^23 + 2^22. */
+
+.SPRANGE: /* Single precision overflow/underflow bounds. */
+ .4byte 0x42b17217 /* if x>this bound, then result overflows. */
+ .4byte 0x42cff1b4 /* if x<this bound, then result underflows. */
+
+.SPLARGE_SMALL:
+ .4byte 0x71800000 /* 2^100. */
+ .4byte 0x0d800000 /* 2^-100. */
+
+.INF_ZERO:
+ .4byte 0x7f800000 /* Single precision Inf. */
+ .4byte 0 /* Single precision zero. */
+
+strong_alias (__ieee754_expf, __expf_finite)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
new file mode 100644
index 0000000000..7fd86fdf87
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power7/fpu/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
new file mode 100644
index 0000000000..8dfa0076e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
@@ -0,0 +1,508 @@
+/* Optimized cosf(). PowerPC64/POWER8 version.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define _ERRNO_H 1
+#include <bits/errno.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT 23
+#define FLOAT_EXPONENT_BIAS 127
+#define INTEGER_BITS 3
+
+#define PI_4 0x3f490fdb /* PI/4 */
+#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */
+#define TWO_PN5 0x3d000000 /* 2^-5 */
+#define TWO_PN27 0x32000000 /* 2^-27 */
+#define INFINITY 0x7f800000
+#define TWO_P23 0x4b000000 /* 2^23 */
+#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */
+
+ /* Implements the function
+
+ float [fp1] cosf (float [fp1] x) */
+
+ .machine power8
+EALIGN(__cosf, 4, 0)
+ addis r9,r2,L(anchor)@toc@ha
+ addi r9,r9,L(anchor)@toc@l
+
+ lis r4,PI_4@h
+ ori r4,r4,PI_4@l
+
+ xscvdpspn v0,v1
+ mfvsrd r8,v0
+ rldicl r3,r8,32,33 /* Remove sign bit. */
+
+ cmpw r3,r4
+ bge L(greater_or_equal_pio4)
+
+ lis r4,TWO_PN5@h
+ ori r4,r4,TWO_PN5@l
+
+ cmpw r3,r4
+ blt L(less_2pn5)
+
+ /* Chebyshev polynomial of the form:
+ * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9)
+ lfd fp10,(L(C1)-L(anchor))(r9)
+ lfd fp11,(L(C2)-L(anchor))(r9)
+ lfd fp12,(L(C3)-L(anchor))(r9)
+ lfd fp13,(L(C4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp1,fp2,fp4,fp3 /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ frsp fp1,fp1 /* Round to single precision. */
+
+	blr
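
The fmadd chain above is a straight Horner evaluation in z = x^2.  For
reference, the same computation in C (C0..C4 are the double constants
tabulated at the end of this file):

    extern const double C0, C1, C2, C3, C4;

    static double
    cos_poly_sketch (double x)
    {
      double z = x * x;
      double r = C3 + z * C4;   /* innermost term first */
      r = C2 + z * r;
      r = C1 + z * r;
      r = C0 + z * r;
      return 1.0 + z * r;
    }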
+
+ .balign 16
+L(greater_or_equal_pio4):
+ lis r4,NINEPI_4@h
+ ori r4,r4,NINEPI_4@l
+ cmpw r3,r4
+ bge L(greater_or_equal_9pio4)
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+ fabs fp1,fp1 /* |x| */
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ fctiduz fp2,fp2
+	mfvsrd	r3,v2	/* n = trunc(|x| / (PI/4)) */
+
+ /* Now use that quotient to find |x| mod (PI/2). */
+ addi r7,r3,1
+ rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */
+ addi r6,r9,(L(pio2_table)-L(anchor))
+ lfdx fp4,r5,r6
+ fsub fp1,fp1,fp4
+
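In C terms, the reduction just performed is roughly the sketch below
(pio2_table[] is the k*PI/2 table at the end of this file; the truncating
cast plays the role of fctiduz; names are illustrative):

    #include <math.h>
    #include <stdint.h>

    extern const double pio2_table[11];   /* k * PI/2, k = 0..10 */

    /* Valid for PI/4 <= ax < 9*PI/4; returns a value in [-PI/4, PI/4]
       and stores n + 1 for the quadrant logic at L(reduced).  */
    static double
    reduce_pio2_sketch (double ax, uint64_t *n1)
    {
      uint64_t n = (uint64_t) (ax * (4.0 / M_PI));  /* quotient |x|/(PI/4) */
      *n1 = n + 1;
      return ax - pio2_table[(n + 1) >> 1];
    }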
+ .balign 16
+L(reduced):
+ /* Now we are in the range -PI/4 to PI/4. */
+
+ /* Work out if we are in a positive or negative primary interval. */
+ addi r7,r7,2
+ rldicl r4,r7,62,63 /* ((n+3) >> 2) & 1 */
+
+ /* Load a 1.0 or -1.0. */
+ addi r5,r9,(L(ones)-L(anchor))
+ sldi r4,r4,3
+ lfdx fp0,r4,r5
+
+ /* Are we in the primary interval of sin or cos? */
+ andi. r4,r7,0x2
+ bne L(cos)
+
+ /* Chebyshev polynomial of the form:
+ x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9)
+ lfd fp10,(L(S1)-L(anchor))(r9)
+ lfd fp11,(L(S2)-L(anchor))(r9)
+ lfd fp12,(L(S3)-L(anchor))(r9)
+ lfd fp13,(L(S4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(cos):
+ /* Chebyshev polynomial of the form:
+ 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9)
+ lfd fp10,(L(C1)-L(anchor))(r9)
+ lfd fp11,(L(C2)-L(anchor))(r9)
+ lfd fp12,(L(C3)-L(anchor))(r9)
+ lfd fp13,(L(C4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_9pio4):
+ lis r4,INFINITY@h
+ ori r4,r4,INFINITY@l
+ cmpw r3,r4
+ bge L(inf_or_nan)
+
+ lis r4,TWO_P23@h
+ ori r4,r4,TWO_P23@l
+ cmpw r3,r4
+ bge L(greater_or_equal_2p23)
+
+ fabs fp1,fp1 /* |x| */
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+ lfd fp4,(L(DPhalf)-L(anchor))(r9)
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ friz fp2,fp2 /* n = floor(|x|/(PI/4)) */
+
+ /* Calculate (n + 1) / 2. */
+ fadd fp2,fp2,fp3 /* n + 1 */
+ fmul fp3,fp2,fp4 /* (n + 1) / 2 */
+ friz fp3,fp3
+
+ lfd fp4,(L(pio2hi)-L(anchor))(r9)
+ lfd fp5,(L(pio2lo)-L(anchor))(r9)
+
+ fmul fp6,fp4,fp3
+ fadd fp6,fp6,fp1
+ fmadd fp1,fp5,fp3,fp6
+
+ fctiduz fp2,fp2
+ mfvsrd r7,v2 /* n + 1 */
+
+ b L(reduced)
+
+ .balign 16
+L(inf_or_nan):
+	bne	L(skip_errno_setting)	/* Is it a NaN? */
+
+	/* We delayed creating the stack frame and saving the link register
+	   until this point, because only now are we sure that doing so is
+	   actually needed. */
+
+ stfd fp1,-8(r1)
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+ cfi_offset(lr, 16)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ bl JUMPTARGET(__errno_location)
+ nop
+
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ ld r0,16(r1)
+ mtlr r0
+
+ lfd fp1,-8(r1)
+
+ /* errno = EDOM */
+ li r4,EDOM
+ stw r4,0(r3)
+
+L(skip_errno_setting):
+ fsub fp1,fp1,fp1 /* x - x */
+ blr
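
The whole Inf/NaN path boils down to the C sketch below; the stack-frame
dance above exists only because __errno_location is a function call.
(Hedged equivalent, not the literal implementation.)

    #include <errno.h>
    #include <math.h>

    static float
    cosf_inf_nan_sketch (float x)
    {
      if (isinf (x))
        errno = EDOM;
      return x - x;   /* NaN in both cases; raises invalid for Inf.  */
    }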
+
+ .balign 16
+L(greater_or_equal_2p23):
+ fabs fp1,fp1
+
+ srwi r4,r3,FLOAT_EXPONENT_SHIFT
+ subi r4,r4,FLOAT_EXPONENT_BIAS
+
+ /* We reduce the input modulo pi/4, so we need 3 bits of integer
+ to determine where in 2*pi we are. Index into our array
+ accordingly. */
+ addi r4,r4,INTEGER_BITS
+
+ /* To avoid an expensive divide, for the range we care about (0 - 127)
+ we can transform x/28 into:
+
+ x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+ mulhwu returns the top 32 bits of the 64 bit result, doing the
+ shift for us in the same instruction. The top 32 bits are undefined,
+ so we have to mask them. */
+
+ lis r6,FX_FRACTION_1_28@h
+ ori r6,r6,FX_FRACTION_1_28@l
+ mulhwu r5,r4,r6
+ clrldi r5,r5,32
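
The reciprocal-multiply trick described above can be checked exhaustively
for the range it is used on.  A small self-contained test (illustrative
only, not part of the build):

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
      const uint64_t r = 0x9249250;   /* FX_FRACTION_1_28 */
      for (uint32_t x = 0; x < 128; x++)
        assert ((uint32_t) ((x * r) >> 32) == x / 28);
      return 0;
    }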
+
+ /* Get our pointer into the invpio4_table array. */
+ sldi r4,r5,3
+ addi r6,r9,(L(invpio4_table)-L(anchor))
+ add r4,r4,r6
+
+ lfd fp2,0(r4)
+ lfd fp3,8(r4)
+ lfd fp4,16(r4)
+ lfd fp5,24(r4)
+
+ fmul fp6,fp2,fp1
+ fmul fp7,fp3,fp1
+ fmul fp8,fp4,fp1
+ fmul fp9,fp5,fp1
+
+ /* Mask off larger integer bits in highest double word that we don't
+ care about to avoid losing precision when combining with smaller
+ values. */
+ fctiduz fp10,fp6
+ mfvsrd r7,v10
+ rldicr r7,r7,0,(63-INTEGER_BITS)
+ mtvsrd v10,r7
+ fcfidu fp10,fp10 /* Integer bits. */
+
+ fsub fp6,fp6,fp10 /* highest -= integer bits */
+
+ /* Work out the integer component, rounded down. Use the top two
+ limbs for this. */
+ fadd fp10,fp6,fp7 /* highest + higher */
+
+ fctiduz fp10,fp10
+ mfvsrd r7,v10
+ andi. r0,r7,1
+ fcfidu fp10,fp10
+
+ /* Subtract integer component from highest limb. */
+ fsub fp12,fp6,fp10
+
+ beq L(even_integer)
+
+ /* Our integer component is odd, so we are in the -PI/4 to 0 primary
+ region. We need to shift our result down by PI/4, and to do this
+ in the mod (4/PI) space we simply subtract 1. */
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+L(even_integer):
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+	fadd	fp12,fp12,fp8
+	fadd	fp12,fp12,fp9
+
+ /* We need to check if the addition of all the limbs resulted in us
+ overflowing 1.0. */
+ fcmpu 0,fp12,fp11
+ bgt L(greater_than_one)
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+L(greater_than_one):
+ /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+ integer, and subtract 1.0 from our result. Since that makes the
+ integer component odd, we need to subtract another 1.0 as
+ explained above. */
+ addi r7,r7,1
+
+ lfd fp11,(L(DPtwo)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
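
The limb bookkeeping in this path (masking the high integer bits, the
parity test, and the two subtract-1 adjustments) is easier to follow as C.
The sketch below is conceptual: p0..p3 stand for the four partial products
|x| * invpio4_table[e+k], with p0 the most significant, and all names are
illustrative.

    #include <stdint.h>

    /* Returns the fraction to be multiplied by PI/4; *n1 receives the
       n + 1 value consumed by L(reduced).  */
    static double
    combine_limbs_sketch (double p0, double p1, double p2, double p3,
                          uint64_t *n1)
    {
      p0 -= (double) ((uint64_t) p0 & ~7ULL);  /* keep 3 integer bits */
      uint64_t i = (uint64_t) (p0 + p1);       /* integer part, mod 8 */
      double f = p0 - (double) i;
      if (i & 1)
        f = (f - 1.0) + p1 + p2 + p3;          /* odd: shift down by PI/4 */
      else
        {
          f = f + p1 + p2 + p3;
          if (f > 1.0)                         /* limbs carried past 1.0 */
            {
              i += 1;
              f -= 2.0;
            }
        }
      *n1 = i + 1;
      return f;
    }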
+
+ .balign 16
+L(less_2pn5):
+ lis r4,TWO_PN27@h
+ ori r4,r4,TWO_PN27@l
+
+ cmpw r3,r4
+ blt L(less_2pn27)
+
+ /* A simpler Chebyshev approximation is close enough for this range:
+ 1.0+x^2*(CC0+x^3*CC1). */
+
+ lfd fp10,(L(CC0)-L(anchor))(r9)
+ lfd fp11,(L(CC1)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+ lfd fp1,(L(DPone)-L(anchor))(r9)
+
+ fmadd fp4,fp3,fp11,fp10 /* CC0+x^3*CC1 */
+ fmadd fp1,fp2,fp4,fp1 /* 1.0+x^2*(CC0+x^3*CC1) */
+
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(less_2pn27):
+ /* Handle some special cases:
+
+ cosf(subnormal) raises inexact
+ cosf(min_normalized) raises inexact
+ cosf(normalized) raises inexact. */
+
+ lfd fp2,(L(DPone)-L(anchor))(r9)
+
+ fabs fp1,fp1 /* |x| */
+ fsub fp1,fp2,fp1 /* 1.0-|x| */
+
+ frsp fp1,fp1
+
+ blr
+
+END (__cosf)
+
+ .section .rodata, "a"
+
+ .balign 8
+
+L(anchor):
+
+ /* Chebyshev constants for sin, range -PI/4 - PI/4. */
+L(S0): .8byte 0xbfc5555555551cd9
+L(S1): .8byte 0x3f81111110c2688b
+L(S2): .8byte 0xbf2a019f8b4bd1f9
+L(S3): .8byte 0x3ec71d7264e6b5b4
+L(S4): .8byte 0xbe5a947e1674b58a
+
+ /* Chebyshev constants for cos, range 2^-27 - 2^-5. */
+L(CC0): .8byte 0xbfdfffffff5cc6fd
+L(CC1): .8byte 0x3fa55514b178dac5
+
+ /* Chebyshev constants for cos, range -PI/4 - PI/4. */
+L(C0): .8byte 0xbfdffffffffe98ae
+L(C1): .8byte 0x3fa55555545c50c7
+L(C2): .8byte 0xbf56c16b348b6874
+L(C3): .8byte 0x3efa00eb9ac43cc0
+L(C4): .8byte 0xbe923c97dd8844d7
+
+L(invpio2):
+ .8byte 0x3fe45f306dc9c883 /* 2/PI */
+
+L(invpio4):
+ .8byte 0x3ff45f306dc9c883 /* 4/PI */
+
+L(invpio4_table):
+ .8byte 0x0000000000000000
+ .8byte 0x3ff45f306c000000
+ .8byte 0x3e3c9c882a000000
+ .8byte 0x3c54fe13a8000000
+ .8byte 0x3aaf47d4d0000000
+ .8byte 0x38fbb81b6c000000
+ .8byte 0x3714acc9e0000000
+ .8byte 0x3560e4107c000000
+ .8byte 0x33bca2c756000000
+ .8byte 0x31fbd778ac000000
+ .8byte 0x300b7246e0000000
+ .8byte 0x2e5d2126e8000000
+ .8byte 0x2c97003248000000
+ .8byte 0x2ad77504e8000000
+ .8byte 0x290921cfe0000000
+ .8byte 0x274deb1cb0000000
+ .8byte 0x25829a73e0000000
+ .8byte 0x23fd1046be000000
+ .8byte 0x2224baed10000000
+ .8byte 0x20709d338e000000
+ .8byte 0x1e535a2f80000000
+ .8byte 0x1cef904e64000000
+ .8byte 0x1b0d639830000000
+ .8byte 0x1964ce7d24000000
+ .8byte 0x17b908bf16000000
+
+L(pio4):
+ .8byte 0x3fe921fb54442d18 /* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+ to avoid losing significant bits when multiplying with up to
+ (2^22)/(pi/2). */
+L(pio2hi):
+ .8byte 0xbff921fb54400000
+
+L(pio2lo):
+ .8byte 0xbdd0b4611a626332
+
+L(pio2_table):
+ .8byte 0
+ .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */
+ .8byte 0x400921fb54442d18 /* 2 * PI/2 */
+ .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */
+ .8byte 0x401921fb54442d18 /* 4 * PI/2 */
+ .8byte 0x401f6a7a2955385e /* 5 * PI/2 */
+ .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */
+ .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */
+ .8byte 0x402921fb54442d18 /* 8 * PI/2 */
+ .8byte 0x402c463abeccb2bb /* 9 * PI/2 */
+ .8byte 0x402f6a7a2955385e /* 10 * PI/2 */
+
+L(small):
+ .8byte 0x3cd0000000000000 /* 2^-50 */
+
+L(ones):
+ .8byte 0x3ff0000000000000 /* +1.0 */
+ .8byte 0xbff0000000000000 /* -1.0 */
+
+L(DPhalf):
+ .8byte 0x3fe0000000000000 /* 0.5 */
+
+L(DPone):
+ .8byte 0x3ff0000000000000 /* 1.0 */
+
+L(DPtwo):
+ .8byte 0x4000000000000000 /* 2.0 */
+
+weak_alias(__cosf, cosf)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
new file mode 100644
index 0000000000..fcdcb60293
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
@@ -0,0 +1,56 @@
+/* isfinite(). PowerPC64/POWER8 version.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
+
+/* int [r3] __finite ([fp1] x) */
+
+EALIGN (__finite, 4, 0)
+ CALL_MCOUNT 0
+ MFVSRD_R3_V1
+ lis r9,0x8010
+	clrldi	r3,r3,1		/* r3 = r3 & 0x7fffffffffffffff */
+	rldicr	r9,r9,32,31	/* r9 = 0x8010000000000000 */
+ add r3,r3,r9
+ rldicl r3,r3,1,63
+ blr
+END (__finite)
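
The branchless test reads more naturally in C: after clearing the sign bit,
adding 0x8010000000000000 wraps past 2^64 exactly when the exponent field
is all ones (Inf/NaN), clearing the top bit of the sum.  A sketch:

    #include <stdint.h>
    #include <string.h>

    static int
    finite_sketch (double x)
    {
      uint64_t u;
      memcpy (&u, &x, sizeof u);                  /* mfvsrd */
      u &= ~(1ULL << 63);                         /* clrldi */
      return (u + 0x8010000000000000ULL) >> 63;   /* 1 iff finite */
    }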
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#if IS_IN (libm)
+# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
+compat_symbol (libm, __finite, __finitel, GLIBC_2_0)
+compat_symbol (libm, finite, finitel, GLIBC_2_0)
+# endif
+#else
+# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S. */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
new file mode 100644
index 0000000000..32814e4525
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
@@ -0,0 +1,61 @@
+/* isinf(). PowerPC64/POWER8 version.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
+
+/* int [r3] __isinf([fp1] x) */
+
+EALIGN (__isinf, 4, 0)
+ CALL_MCOUNT 0
+ MFVSRD_R3_V1
+ lis r9,0x7ff0 /* r9 = 0x7ff0 */
+	rldicl	r10,r3,0,1	/* r10 = r3 & 0x7fffffffffffffff */
+	sldi	r9,r9,32	/* r9 = r9 << 32 */
+	cmpd	cr7,r10,r9	/* |x| == 0x7ff0000000000000 ? */
+ beq cr7,L(inf)
+ li r3,0 /* Not inf */
+ blr
+L(inf):
+ sradi r3,r3,63 /* r3 = r3 >> 63 */
+ ori r3,r3,1 /* r3 = r3 | 0x1 */
+ blr
+END (__isinf)
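
In C, the same logic is an exact-equality test on the non-sign bits, with
the sign folded into the return value (the sradi/ori pair yields -1 or 1).
A sketch:

    #include <stdint.h>
    #include <string.h>

    static int
    isinf_sketch (double x)
    {
      uint64_t u;
      memcpy (&u, &x, sizeof u);
      if ((u & ~(1ULL << 63)) != 0x7ff0000000000000ULL)
        return 0;
      return (u >> 63) ? -1 : 1;
    }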
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S. */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
new file mode 100644
index 0000000000..af52e502b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
@@ -0,0 +1,56 @@
+/* isnan(). PowerPC64/POWER8 version.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
+
+/* int [r3] __isnan([f1] x) */
+
+EALIGN (__isnan, 4, 0)
+ CALL_MCOUNT 0
+ MFVSRD_R3_V1
+ lis r9,0x7ff0
+	clrldi	r3,r3,1		/* r3 = r3 & 0x7fffffffffffffff */
+	rldicr	r9,r9,32,31	/* r9 = 0x7ff0000000000000 */
+ subf r3,r3,r9
+ rldicl r3,r3,1,63
+ blr
+END (__isnan)
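
Here the trick is the mirror image of s_finite.S: subtracting |x| from the
Inf bit pattern borrows into the sign bit exactly when |x| is above it,
i.e. when x is a NaN.  A sketch:

    #include <stdint.h>
    #include <string.h>

    static int
    isnan_sketch (double x)
    {
      uint64_t u;
      memcpy (&u, &x, sizeof u);
      u &= ~(1ULL << 63);
      return (0x7ff0000000000000ULL - u) >> 63;   /* 1 iff NaN */
    }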
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S. */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
new file mode 100644
index 0000000000..aa180b6901
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
@@ -0,0 +1,45 @@
+/* Round double to long int. POWER8 PowerPC64 version.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
+
+/* long long int[r3] __llrint (double x[fp1]) */
+ENTRY (__llrint)
+ CALL_MCOUNT 0
+ fctid fp1,fp1
+ MFVSRD_R3_V1
+ blr
+END (__llrint)
+
+strong_alias (__llrint, __lrint)
+weak_alias (__llrint, llrint)
+weak_alias (__lrint, lrint)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__llrint, __llrintl)
+weak_alias (__llrint, llrintl)
+strong_alias (__lrint, __lrintl)
+weak_alias (__lrint, lrintl)
+#endif
+#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
+compat_symbol (libm, __llrint, llrintl, GLIBC_2_1)
+compat_symbol (libm, __lrint, lrintl, GLIBC_2_1)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
new file mode 100644
index 0000000000..043fc6a089
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
@@ -0,0 +1,48 @@
+/* llround function. POWER8 PowerPC64 version.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <endian.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
+
+/* long long [r3] llround (float x [fp1]) */
+
+ENTRY (__llround)
+ CALL_MCOUNT 0
+ frin fp1,fp1 /* Round to nearest +-0.5. */
+ fctidz fp1,fp1 /* Convert To Integer DW round toward 0. */
+ MFVSRD_R3_V1
+ blr
+END (__llround)
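
frin rounds to nearest with ties away from zero (the semantics of C's
round()), and fctidz then truncates the already-integral value, so the pair
is equivalent to this sketch:

    #include <math.h>

    static long long
    llround_sketch (double x)
    {
      return (long long) round (x);   /* frin, then fctidz */
    }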
+
+strong_alias (__llround, __lround)
+weak_alias (__llround, llround)
+weak_alias (__lround, lround)
+
+#ifdef NO_LONG_DOUBLE
+weak_alias (__llround, llroundl)
+strong_alias (__llround, __llroundl)
+weak_alias (__lround, lroundl)
+strong_alias (__lround, __lroundl)
+#endif
+#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
+compat_symbol (libm, __llround, llroundl, GLIBC_2_1)
+compat_symbol (libm, __lround, lroundl, GLIBC_2_1)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
new file mode 100644
index 0000000000..fb0add3462
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
@@ -0,0 +1,519 @@
+/* Optimized sinf(). PowerPC64/POWER8 version.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define _ERRNO_H 1
+#include <bits/errno.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT 23
+#define FLOAT_EXPONENT_BIAS 127
+#define INTEGER_BITS 3
+
+#define PI_4 0x3f490fdb /* PI/4 */
+#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */
+#define TWO_PN5 0x3d000000 /* 2^-5 */
+#define TWO_PN27 0x32000000 /* 2^-27 */
+#define INFINITY 0x7f800000
+#define TWO_P23 0x4b000000 /* 2^23 */
+#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */
+
+ /* Implements the function
+
+ float [fp1] sinf (float [fp1] x) */
+
+ .machine power8
+EALIGN(__sinf, 4, 0)
+ addis r9,r2,L(anchor)@toc@ha
+ addi r9,r9,L(anchor)@toc@l
+
+ lis r4,PI_4@h
+ ori r4,r4,PI_4@l
+
+ xscvdpspn v0,v1
+ mfvsrd r8,v0
+ rldicl r3,r8,32,33 /* Remove sign bit. */
+
+ cmpw r3,r4
+ bge L(greater_or_equal_pio4)
+
+ lis r4,TWO_PN5@h
+ ori r4,r4,TWO_PN5@l
+
+ cmpw r3,r4
+ blt L(less_2pn5)
+
+ /* Chebyshev polynomial of the form:
+ * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9)
+ lfd fp10,(L(S1)-L(anchor))(r9)
+ lfd fp11,(L(S2)-L(anchor))(r9)
+ lfd fp12,(L(S3)-L(anchor))(r9)
+ lfd fp13,(L(S4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp1,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_pio4):
+ lis r4,NINEPI_4@h
+ ori r4,r4,NINEPI_4@l
+ cmpw r3,r4
+ bge L(greater_or_equal_9pio4)
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+ fabs fp1,fp1 /* |x| */
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ fctiduz fp2,fp2
+	mfvsrd	r3,v2	/* n = trunc(|x| / (PI/4)) */
+
+ /* Now use that quotient to find |x| mod (PI/2). */
+ addi r7,r3,1
+ rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */
+ addi r6,r9,(L(pio2_table)-L(anchor))
+ lfdx fp4,r5,r6
+ fsub fp1,fp1,fp4
+
+ .balign 16
+L(reduced):
+ /* Now we are in the range -PI/4 to PI/4. */
+
+ /* Work out if we are in a positive or negative primary interval. */
+ rldicl r4,r7,62,63 /* ((n+1) >> 2) & 1 */
+
+ /* We are operating on |x|, so we need to add back the original
+ sign. */
+ rldicl r8,r8,33,63 /* (x >> 31) & 1, ie the sign bit. */
+ xor r4,r4,r8 /* 0 if result should be positive,
+ 1 if negative. */
+
+ /* Load a 1.0 or -1.0. */
+ addi r5,r9,(L(ones)-L(anchor))
+ sldi r4,r4,3
+ lfdx fp0,r4,r5
+
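Unlike cosf, sinf is odd, so the quadrant parity is XORed with the input's
sign bit before indexing the L(ones) pair, as the lfdx above does.  As a
sketch (n1 is the n + 1 value from the reduction, ux the IEEE bits of x;
illustrative names):

    #include <stdint.h>

    static double
    sign_factor_sketch (uint64_t n1, uint32_t ux)
    {
      unsigned neg = (unsigned) ((n1 >> 2) & 1) ^ (ux >> 31);
      return neg ? -1.0 : 1.0;   /* the lfdx from L(ones) */
    }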
+ /* Are we in the primary interval of sin or cos? */
+ andi. r4,r7,0x2
+ bne L(cos)
+
+ /* Chebyshev polynomial of the form:
+ x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9)
+ lfd fp10,(L(S1)-L(anchor))(r9)
+ lfd fp11,(L(S2)-L(anchor))(r9)
+ lfd fp12,(L(S3)-L(anchor))(r9)
+ lfd fp13,(L(S4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(cos):
+ /* Chebyshev polynomial of the form:
+ 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9)
+ lfd fp10,(L(C1)-L(anchor))(r9)
+ lfd fp11,(L(C2)-L(anchor))(r9)
+ lfd fp12,(L(C3)-L(anchor))(r9)
+ lfd fp13,(L(C4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_9pio4):
+ lis r4,INFINITY@h
+ ori r4,r4,INFINITY@l
+ cmpw r3,r4
+ bge L(inf_or_nan)
+
+ lis r4,TWO_P23@h
+ ori r4,r4,TWO_P23@l
+ cmpw r3,r4
+ bge L(greater_or_equal_2p23)
+
+ fabs fp1,fp1 /* |x| */
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+ lfd fp4,(L(DPhalf)-L(anchor))(r9)
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ friz fp2,fp2 /* n = floor(|x|/(PI/4)) */
+
+ /* Calculate (n + 1) / 2. */
+ fadd fp2,fp2,fp3 /* n + 1 */
+ fmul fp3,fp2,fp4 /* (n + 1) / 2 */
+ friz fp3,fp3
+
+ lfd fp4,(L(pio2hi)-L(anchor))(r9)
+ lfd fp5,(L(pio2lo)-L(anchor))(r9)
+
+ fmul fp6,fp4,fp3
+ fadd fp6,fp6,fp1
+ fmadd fp1,fp5,fp3,fp6
+
+ fctiduz fp2,fp2
+ mfvsrd r7,v2 /* n + 1 */
+
+ b L(reduced)
+
+ .balign 16
+L(inf_or_nan):
+	bne	L(skip_errno_setting)	/* Is it a NaN? */
+
+	/* We delayed creating the stack frame and saving the link register
+	   until this point, because only now are we sure that doing so is
+	   actually needed. */
+
+ stfd fp1,-8(r1)
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+ cfi_offset(lr, 16)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ bl JUMPTARGET(__errno_location)
+ nop
+
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ ld r0,16(r1)
+ mtlr r0
+
+ lfd fp1,-8(r1)
+
+ /* errno = EDOM */
+ li r4,EDOM
+ stw r4,0(r3)
+
+L(skip_errno_setting):
+ fsub fp1,fp1,fp1 /* x - x */
+ blr
+
+ .balign 16
+L(greater_or_equal_2p23):
+ fabs fp1,fp1
+
+ srwi r4,r3,FLOAT_EXPONENT_SHIFT
+ subi r4,r4,FLOAT_EXPONENT_BIAS
+
+ /* We reduce the input modulo pi/4, so we need 3 bits of integer
+ to determine where in 2*pi we are. Index into our array
+ accordingly. */
+ addi r4,r4,INTEGER_BITS
+
+ /* To avoid an expensive divide, for the range we care about (0 - 127)
+ we can transform x/28 into:
+
+ x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+ mulhwu returns the top 32 bits of the 64 bit result, doing the
+ shift for us in the same instruction. The top 32 bits are undefined,
+ so we have to mask them. */
+
+ lis r6,FX_FRACTION_1_28@h
+ ori r6,r6,FX_FRACTION_1_28@l
+ mulhwu r5,r4,r6
+ clrldi r5,r5,32
+
+ /* Get our pointer into the invpio4_table array. */
+ sldi r4,r5,3
+ addi r6,r9,(L(invpio4_table)-L(anchor))
+ add r4,r4,r6
+
+ lfd fp2,0(r4)
+ lfd fp3,8(r4)
+ lfd fp4,16(r4)
+ lfd fp5,24(r4)
+
+ fmul fp6,fp2,fp1
+ fmul fp7,fp3,fp1
+ fmul fp8,fp4,fp1
+ fmul fp9,fp5,fp1
+
+ /* Mask off larger integer bits in highest double word that we don't
+ care about to avoid losing precision when combining with smaller
+ values. */
+ fctiduz fp10,fp6
+ mfvsrd r7,v10
+ rldicr r7,r7,0,(63-INTEGER_BITS)
+ mtvsrd v10,r7
+ fcfidu fp10,fp10 /* Integer bits. */
+
+ fsub fp6,fp6,fp10 /* highest -= integer bits */
+
+ /* Work out the integer component, rounded down. Use the top two
+ limbs for this. */
+ fadd fp10,fp6,fp7 /* highest + higher */
+
+ fctiduz fp10,fp10
+ mfvsrd r7,v10
+ andi. r0,r7,1
+ fcfidu fp10,fp10
+
+ /* Subtract integer component from highest limb. */
+ fsub fp12,fp6,fp10
+
+ beq L(even_integer)
+
+ /* Our integer component is odd, so we are in the -PI/4 to 0 primary
+ region. We need to shift our result down by PI/4, and to do this
+ in the mod (4/PI) space we simply subtract 1. */
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+L(even_integer):
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+	fadd	fp12,fp12,fp8
+	fadd	fp12,fp12,fp9
+
+ /* We need to check if the addition of all the limbs resulted in us
+ overflowing 1.0. */
+ fcmpu 0,fp12,fp11
+ bgt L(greater_than_one)
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+L(greater_than_one):
+ /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+ integer, and subtract 1.0 from our result. Since that makes the
+ integer component odd, we need to subtract another 1.0 as
+ explained above. */
+ addi r7,r7,1
+
+ lfd fp11,(L(DPtwo)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+ .balign 16
+L(less_2pn5):
+ lis r4,TWO_PN27@h
+ ori r4,r4,TWO_PN27@l
+
+ cmpw r3,r4
+ blt L(less_2pn27)
+
+ /* A simpler Chebyshev approximation is close enough for this range:
+ x+x^3*(SS0+x^2*SS1). */
+
+ lfd fp10,(L(SS0)-L(anchor))(r9)
+ lfd fp11,(L(SS1)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp11,fp10 /* SS0+x^2*SS1 */
+ fmadd fp1,fp3,fp4,fp1 /* x+x^3*(SS0+x^2*SS1) */
+
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(less_2pn27):
+ cmpwi r3,0
+ beq L(zero)
+
+ /* Handle some special cases:
+
+ sinf(subnormal) raises inexact/underflow
+ sinf(min_normalized) raises inexact/underflow
+ sinf(normalized) raises inexact. */
+
+ lfd fp2,(L(small)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp2 /* x * small */
+ fsub fp1,fp1,fp2 /* x - x * small */
+
+ frsp fp1,fp1
+
+ blr
+
+ .balign 16
+L(zero):
+ blr
+
+END (__sinf)
+
+ .section .rodata, "a"
+
+ .balign 8
+
+L(anchor):
+
+ /* Chebyshev constants for sin, range -PI/4 - PI/4. */
+L(S0): .8byte 0xbfc5555555551cd9
+L(S1): .8byte 0x3f81111110c2688b
+L(S2): .8byte 0xbf2a019f8b4bd1f9
+L(S3): .8byte 0x3ec71d7264e6b5b4
+L(S4): .8byte 0xbe5a947e1674b58a
+
+ /* Chebyshev constants for sin, range 2^-27 - 2^-5. */
+L(SS0): .8byte 0xbfc555555543d49d
+L(SS1): .8byte 0x3f8110f475cec8c5
+
+ /* Chebyshev constants for cos, range -PI/4 - PI/4. */
+L(C0): .8byte 0xbfdffffffffe98ae
+L(C1): .8byte 0x3fa55555545c50c7
+L(C2): .8byte 0xbf56c16b348b6874
+L(C3): .8byte 0x3efa00eb9ac43cc0
+L(C4): .8byte 0xbe923c97dd8844d7
+
+L(invpio2):
+ .8byte 0x3fe45f306dc9c883 /* 2/PI */
+
+L(invpio4):
+ .8byte 0x3ff45f306dc9c883 /* 4/PI */
+
+L(invpio4_table):
+ .8byte 0x0000000000000000
+ .8byte 0x3ff45f306c000000
+ .8byte 0x3e3c9c882a000000
+ .8byte 0x3c54fe13a8000000
+ .8byte 0x3aaf47d4d0000000
+ .8byte 0x38fbb81b6c000000
+ .8byte 0x3714acc9e0000000
+ .8byte 0x3560e4107c000000
+ .8byte 0x33bca2c756000000
+ .8byte 0x31fbd778ac000000
+ .8byte 0x300b7246e0000000
+ .8byte 0x2e5d2126e8000000
+ .8byte 0x2c97003248000000
+ .8byte 0x2ad77504e8000000
+ .8byte 0x290921cfe0000000
+ .8byte 0x274deb1cb0000000
+ .8byte 0x25829a73e0000000
+ .8byte 0x23fd1046be000000
+ .8byte 0x2224baed10000000
+ .8byte 0x20709d338e000000
+ .8byte 0x1e535a2f80000000
+ .8byte 0x1cef904e64000000
+ .8byte 0x1b0d639830000000
+ .8byte 0x1964ce7d24000000
+ .8byte 0x17b908bf16000000
+
+L(pio4):
+ .8byte 0x3fe921fb54442d18 /* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+ to avoid losing significant bits when multiplying with up to
+ (2^22)/(pi/2). */
+L(pio2hi):
+ .8byte 0xbff921fb54400000
+
+L(pio2lo):
+ .8byte 0xbdd0b4611a626332
+
+L(pio2_table):
+ .8byte 0
+ .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */
+ .8byte 0x400921fb54442d18 /* 2 * PI/2 */
+ .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */
+ .8byte 0x401921fb54442d18 /* 4 * PI/2 */
+ .8byte 0x401f6a7a2955385e /* 5 * PI/2 */
+ .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */
+ .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */
+ .8byte 0x402921fb54442d18 /* 8 * PI/2 */
+ .8byte 0x402c463abeccb2bb /* 9 * PI/2 */
+ .8byte 0x402f6a7a2955385e /* 10 * PI/2 */
+
+L(small):
+ .8byte 0x3cd0000000000000 /* 2^-50 */
+
+L(ones):
+ .8byte 0x3ff0000000000000 /* +1.0 */
+ .8byte 0xbff0000000000000 /* -1.0 */
+
+L(DPhalf):
+ .8byte 0x3fe0000000000000 /* 0.5 */
+
+L(DPone):
+ .8byte 0x3ff0000000000000 /* 1.0 */
+
+L(DPtwo):
+ .8byte 0x4000000000000000 /* 2.0 */
+
+weak_alias(__sinf, sinf)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S
new file mode 100644
index 0000000000..46b9c0067a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S
@@ -0,0 +1,1447 @@
+/* Optimized memcmp implementation for POWER7/PowerPC64.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+ .machine power7
+EALIGN (MEMCMP, 4, 0)
+ CALL_MCOUNT 3
+
+#define rRTN r3
+#define rSTR1 r3 /* First string arg. */
+#define rSTR2 r4 /* Second string arg. */
+#define rN r5 /* Max string length. */
+#define rWORD1 r6 /* Current word in s1. */
+#define rWORD2 r7 /* Current word in s2. */
+#define rWORD3 r8 /* Next word in s1. */
+#define rWORD4 r9 /* Next word in s2. */
+#define rWORD5 r10 /* Next word in s1. */
+#define rWORD6 r11 /* Next word in s2. */
+
+#define rOFF8 r20 /* 8 bytes offset. */
+#define rOFF16 r21 /* 16 bytes offset. */
+#define rOFF24 r22 /* 24 bytes offset. */
+#define rOFF32	r23	/* 32 bytes offset.  */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rWORD7 r30 /* Next word in s1. */
+#define rWORD8 r31 /* Next word in s2. */
+
+#define rWORD8SAVE (-8)
+#define rWORD7SAVE (-16)
+#define rOFF8SAVE (-24)
+#define rOFF16SAVE (-32)
+#define rOFF24SAVE (-40)
+#define rOFF32SAVE (-48)
+#define rSHRSAVE (-56)
+#define rSHLSAVE (-64)
+#define rWORD8SHIFTSAVE (-72)
+#define rWORD2SHIFTSAVE (-80)
+#define rWORD4SHIFTSAVE (-88)
+#define rWORD6SHIFTSAVE (-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD ldbrx
+#else
+# define LD ldx
+#endif
+
+ xor r10, rSTR2, rSTR1
+ cmpldi cr6, rN, 0
+ cmpldi cr1, rN, 8
+ clrldi. r0, r10, 61
+ clrldi r12, rSTR1, 61
+ cmpldi cr5, r12, 0
+ beq- cr6, L(zeroLength)
+ dcbt 0, rSTR1
+ dcbt 0, rSTR2
+ /* If less than 8 bytes or not aligned, use the unaligned
+ byte loop. */
+ blt cr1, L(bytealigned)
+ bne L(unalignedqw)
+/* At this point we know both strings have the same alignment and the
+ compare length is at least 8 bytes. r12 contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+ of r12 to 0. If r12 == 0 then we are already double word
+ aligned and can perform the DW aligned loop. */
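
The overall shape of the aligned case, expressed as portable C (a sketch
only: the assembly uses 16-byte VMX loads and ldbrx, while here
__builtin_bswap64 plays the byte-reversal role on a little-endian host):

    #include <stdint.h>
    #include <string.h>

    static int
    memcmp_dw_sketch (const unsigned char *s1, const unsigned char *s2,
                      size_t n)
    {
      while (n >= 8)
        {
          uint64_t w1, w2;
          memcpy (&w1, s1, 8);
          memcpy (&w2, s2, 8);
          if (w1 != w2)
            {
              /* Compare as big-endian so the lowest-address byte wins.  */
              w1 = __builtin_bswap64 (w1);
              w2 = __builtin_bswap64 (w2);
              return w1 < w2 ? -1 : 1;
            }
          s1 += 8; s2 += 8; n -= 8;
        }
      for (; n != 0; n--, s1++, s2++)
        if (*s1 != *s2)
          return *s1 < *s2 ? -1 : 1;
      return 0;
    }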
+
+ .align 4
+L(samealignment):
+ or r11, rSTR2, rSTR1
+ clrldi. r11, r11, 60
+ beq L(qw_align)
+ /* Try to align to QW else proceed to DW loop. */
+ clrldi. r10, r10, 60
+ bne L(DW)
+	/* To reach QW alignment, first compare one DW.  */
+ clrrdi rSTR1, rSTR1, 3
+ clrrdi rSTR2, rSTR2, 3
+ subfic r10, r12, 8
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ sldi r9, r10, 3
+ subfic r9, r9, 64
+ sld rWORD1, rWORD1, r9
+ sld rWORD2, rWORD2, r9
+ cmpld cr6, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(ret_diff)
+ subf rN, r10, rN
+
+ cmpld cr6, r11, r12
+ bgt cr6, L(qw_align)
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpld cr6, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(different)
+ cmpldi cr6, rN, 8
+ ble cr6, L(zeroLength)
+ addi rN, rN, -8
+ /* Now both rSTR1 and rSTR2 are aligned to QW. */
+ .align 4
+L(qw_align):
+ vspltisb v0, 0
+ srdi. r6, rN, 6
+ li r8, 16
+ li r10, 32
+ li r11, 48
+ ble cr0, L(lessthan64)
+ mtctr r6
+ vspltisb v8, 0
+ vspltisb v6, 0
+ /* Aligned vector loop. */
+ .align 4
+L(aligned_loop):
+ lvx v4, 0, rSTR1
+ lvx v5, 0, rSTR2
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ lvx v6, rSTR1, r8
+ lvx v8, rSTR2, r8
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ lvx v4, rSTR1, r10
+ lvx v5, rSTR2, r10
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ lvx v6, rSTR1, r11
+ lvx v8, rSTR2, r11
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ addi rSTR1, rSTR1, 64
+ addi rSTR2, rSTR2, 64
+ bdnz L(aligned_loop)
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ clrldi rN, rN, 58
+ /* Handle remainder for aligned loop. */
+ .align 4
+L(lessthan64):
+ mr r9, rSTR1
+ cmpdi cr6, rN, 0
+ li rSTR1, 0
+ blelr cr6
+ lvx v4, 0, r9
+ lvx v5, 0, rSTR2
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r8
+ lvx v5, rSTR2, r8
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r10
+ lvx v5, rSTR2, r10
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r11
+ lvx v5, rSTR2, r11
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ blr
+
+ /* Calculate and return the difference. */
+ .align 4
+L(different1):
+ cmpdi cr6, rN, 16
+ bge cr6, L(different2)
+ /* Discard unwanted bytes. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v1, 0, rN
+ vperm v4, v4, v0, v1
+ vperm v5, v5, v0, v1
+#else
+ lvsl v1, 0, rN
+ vperm v4, v0, v4, v1
+ vperm v5, v0, v5, v1
+#endif
+ vcmpequb. v7, v4, v5
+ li rRTN, 0
+ bltlr cr6
+ .align 4
+L(different2):
+#ifdef __LITTLE_ENDIAN__
+ /* Reverse bytes for direct comparison. */
+ lvsl v10, r0, r0
+ vspltisb v8, 15
+ vsububm v9, v8, v10
+ vperm v4, v4, v0, v9
+ vperm v5, v5, v0, v9
+#endif
+ MFVRD(r7, v4)
+ MFVRD(r9, v5)
+ cmpld cr6, r7, r9
+ bne cr6, L(ret_diff)
+ /* Difference in second DW. */
+ vsldoi v4, v4, v4, 8
+ vsldoi v5, v5, v5, 8
+ MFVRD(r7, v4)
+ MFVRD(r9, v5)
+ cmpld cr6, r7, r9
+L(ret_diff):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+ .align 4
+L(different3):
+#ifdef __LITTLE_ENDIAN__
+ /* Reverse bytes for direct comparison. */
+ vspltisb v9, 15
+ lvsl v10, r0, r0
+ vsububm v9, v9, v10
+ vperm v6, v6, v0, v9
+ vperm v8, v8, v0, v9
+#endif
+ MFVRD(r7, v6)
+ MFVRD(r9, v8)
+ cmpld cr6, r7, r9
+ bne cr6, L(ret_diff)
+ /* Difference in second DW. */
+ vsldoi v6, v6, v6, 8
+ vsldoi v8, v8, v8, 8
+ MFVRD(r7, v6)
+ MFVRD(r9, v8)
+ cmpld cr6, r7, r9
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+ .align 4
+L(different):
+ cmpldi cr7, rN, 8
+ bgt cr7, L(end)
+ /* Skip unwanted bytes. */
+ sldi r8, rN, 3
+ subfic r8, r8, 64
+ srd rWORD1, rWORD1, r8
+ srd rWORD2, rWORD2, r8
+ cmpld cr6, rWORD1, rWORD2
+ li rRTN, 0
+ beqlr cr6
+L(end):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+ .align 4
+L(unalignedqw):
+ /* Proceed to the DW unaligned loop if there is a chance of a page cross. */
+ rldicl r9, rSTR1, 0, 52
+ add r9, r9, rN
+ cmpldi cr0, r9, 4096-16
+ bgt cr0, L(unaligned)
+ rldicl r9, rSTR2, 0, 52
+ add r9, r9, rN
+ cmpldi cr0, r9, 4096-16
+ bgt cr0, L(unaligned)
+ li r0, 0
+ li r8, 16
+ vspltisb v0, 0
+ /* Check if rSTR1 is aligned to QW. */
+ andi. r11, rSTR1, 0xF
+ beq L(s1_align)
+
+ /* Compare 16B and align S1 to QW. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v10, 0, rSTR1 /* Compute mask. */
+ lvsr v6, 0, rSTR2 /* Compute mask. */
+#else
+ lvsl v10, 0, rSTR1 /* Compute mask. */
+ lvsl v6, 0, rSTR2 /* Compute mask. */
+#endif
+ lvx v5, 0, rSTR2
+ lvx v9, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v6
+#else
+ vperm v5, v5, v9, v6
+#endif
+ lvx v4, 0, rSTR1
+ lvx v9, rSTR1, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v9, v4, v10
+#else
+ vperm v4, v4, v9, v10
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ cmpldi cr6, rN, 16
+ ble cr6, L(zeroLength)
+ subfic r11, r11, 16
+ subf rN, r11, rN
+ add rSTR1, rSTR1, r11
+ add rSTR2, rSTR2, r11
+
+ /* As s1 is QW aligned, prepare for the unaligned loop. */
+ .align 4
+L(s1_align):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+#else
+ lvsl v6, 0, rSTR2
+#endif
+ lvx v5, 0, rSTR2
+ srdi. r6, rN, 6
+ li r10, 32
+ li r11, 48
+ ble cr0, L(lessthan64_unalign)
+ mtctr r6
+ li r9, 64
+ /* Unaligned vector loop. */
+ .align 4
+L(unalign_qwloop):
+ lvx v4, 0, rSTR1
+ lvx v10, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r8
+ lvx v10, rSTR2, r10
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r10
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r11
+ lvx v10, rSTR2, r9
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ addi rSTR1, rSTR1, 64
+ addi rSTR2, rSTR2, 64
+ bdnz L(unalign_qwloop)
+ clrldi rN, rN, 58
+ /* Handle remainder for unaligned loop. */
+ .align 4
+L(lessthan64_unalign):
+ mr r9, rSTR1
+ cmpdi cr6, rN, 0
+ li rSTR1, 0
+ blelr cr6
+ lvx v4, 0, r9
+ lvx v10, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r8
+ lvx v10, rSTR2, r10
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r10
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r11
+ addi r11, r11, 16
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ blr
+
+/* Otherwise we know the two strings have the same alignment (but not
+ yet DW). So we force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DW aligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+ correct and the first DW (shifted) is in the expected register pair. */
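+
+/* Illustrative only: a rough C sketch of the shift-left trick described
+ above (load8 and diff_sign are hypothetical helpers; big-endian byte
+ order assumed -- on LE the LD macro is assumed to provide the
+ byte-reversed load):
+
+      uint64_t w1 = load8 (s1 & ~7UL);   DW containing the first byte
+      uint64_t w2 = load8 (s2 & ~7UL);
+      unsigned k = (s1 & 7) * 8;         bits preceding the first byte
+      if ((w1 << k) != (w2 << k))
+        return diff_sign (w1 << k, w2 << k);
+*/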
+ .align 4
+L(DW):
+ std rWORD8, rWORD8SAVE(r1)
+ std rWORD7, rWORD7SAVE(r1)
+ std rOFF8, rOFF8SAVE(r1)
+ std rOFF16, rOFF16SAVE(r1)
+ std rOFF24, rOFF24SAVE(r1)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ cfi_offset(rOFF8, rOFF8SAVE)
+ cfi_offset(rOFF16, rOFF16SAVE)
+ cfi_offset(rOFF24, rOFF24SAVE)
+ cfi_offset(rOFF32, rOFF32SAVE)
+
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+ clrrdi rSTR1, rSTR1, 3
+ clrrdi rSTR2, rSTR2, 3
+ beq cr5, L(DWaligned)
+ add rN, rN, r12
+ sldi rWORD6, r12, 3
+ srdi r0, rN, 5 /* Divide by 32. */
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ beq L(dPs4)
+ mtctr r0
+ bgt cr1, L(dPs3)
+ beq cr1, L(dPs2)
+
+/* Remainder is 8. */
+ .align 3
+L(dsP1):
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD2, rWORD6
+ cmpld cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ b L(dP1e)
+/* Remainder is 16. */
+ .align 4
+L(dPs2):
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD2, rWORD6
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ b L(dP2e)
+/* Remainder is 24. */
+ .align 4
+L(dPs3):
+ sld rWORD3, rWORD1, rWORD6
+ sld rWORD4, rWORD2, rWORD6
+ cmpld cr1, rWORD3, rWORD4
+ b L(dP3e)
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(dPs4):
+ mtctr r0
+ sld rWORD1, rWORD1, rWORD6
+ sld rWORD2, rWORD2, rWORD6
+ cmpld cr7, rWORD1, rWORD2
+ b L(dP4e)
+
+/* At this point we know both strings are double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWaligned):
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ srdi r0, rN, 5 /* Divide by 32. */
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ beq L(dP4)
+ bgt cr1, L(dP3)
+ beq cr1, L(dP2)
+
+/* Remainder is 8. */
+ .align 4
+L(dP1):
+ mtctr r0
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+ (8-15 byte compare), we want to use only volatile registers. This
+ means we can avoid restoring non-volatile registers since we did not
+ change any on the early exit path. The key here is the non-early
+ exit path only cares about the condition code (cr5), not about which
+ register pair was used. */
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
+ cmpld cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+L(dP1e):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5x)
+ bne cr7, L(dLcr7x)
+
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ bne cr1, L(dLcr1)
+ cmpld cr5, rWORD7, rWORD8
+ bdnz L(dLoop)
+ bne cr6, L(dLcr6)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ .align 3
+L(dP1x):
+ sldi. r12, rN, 3
+ bne cr5, L(dLcr5x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Remainder is 16. */
+ .align 4
+L(dP2):
+ mtctr r0
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+L(dP2e):
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(dLcr6)
+ bne cr5, L(dLcr5)
+ b L(dLoop2)
+ .align 4
+L(dP2x):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ sldi. r12, rN, 3
+ bne cr6, L(dLcr6x)
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr1, L(dLcr1x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Remainder is 24. */
+ .align 4
+L(dP3):
+ mtctr r0
+ LD rWORD3, 0, rSTR1
+ LD rWORD4, 0, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+L(dP3e):
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP3x)
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ bne cr1, L(dLcr1)
+ bne cr6, L(dLcr6)
+ b L(dLoop1)
+/* Again we are on an early exit path (24-31 byte compare); we want to
+ use only volatile registers and avoid restoring non-volatile
+ registers. */
+ .align 4
+L(dP3x):
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ sldi. r12, rN, 3
+ bne cr1, L(dLcr1x)
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ bne cr6, L(dLcr6x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne cr7, L(dLcr7x)
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(dP4):
+ mtctr r0
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+L(dP4e):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+ bne cr1, L(dLcr1)
+ bdz- L(d24) /* Adjust CTR as we start with +4. */
+/* This is the primary loop. */
+ .align 4
+L(dLoop):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+L(dLoop1):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+L(dLoop2):
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+L(dLoop3):
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ bne cr1, L(dLcr1)
+ cmpld cr7, rWORD1, rWORD2
+ bdnz L(dLoop)
+
+L(dL4):
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+ cmpld cr5, rWORD7, rWORD8
+L(d44):
+ bne cr7, L(dLcr7)
+L(d34):
+ bne cr1, L(dLcr1)
+L(d24):
+ bne cr6, L(dLcr6)
+L(d14):
+ sldi. r12, rN, 3
+ bne cr5, L(dLcr5)
+L(d04):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ beq L(duzeroLength)
+/* At this point we have a remainder of 1 to 7 bytes to compare. Since
+ we are aligned it is safe to load the whole double word, and use
+ shift right double to eliminate bits beyond the compare length. */
+L(d00):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ srd rWORD1, rWORD1, rN
+ srd rWORD2, rWORD2, rN
+ cmpld cr7, rWORD1, rWORD2
+ bne cr7, L(dLcr7x)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+ .align 4
+L(dLcr7):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr7x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr1):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr1x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr1
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr6):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr6x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr5):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr5x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr5
+ li rRTN, -1
+ blr
+
+ .align 4
+L(bytealigned):
+ mtctr rN
+
+/* We need to prime this loop. This loop is swing modulo scheduled
+ to avoid pipe delays. The dependent instruction latencies (load to
+ compare to conditional branch) are 2 to 3 cycles. In this loop each
+ dispatch group ends in a branch and takes 1 cycle. Effectively
+ the first iteration of the loop only serves to load operands, and
+ branches based on compares are delayed until the next iteration.
+
+ So we must precondition some registers and condition codes so that
+ we don't exit the loop early on the first iteration. */
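+
+/* Illustrative only: each unrolled step below amounts to this C, with
+ the compare of one byte pair overlapped with the load of the next
+ (prev1, cur1, etc. are hypothetical names):
+
+      cur1 = *++s1; cur2 = *++s2;      load for this iteration
+      if (prev1 != prev2)              test the pair loaded last time
+        return prev1 - prev2;
+      prev1 = cur1; prev2 = cur2;
+*/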
+
+ lbz rWORD1, 0(rSTR1)
+ lbz rWORD2, 0(rSTR2)
+ bdz L(b11)
+ cmpld cr7, rWORD1, rWORD2
+ lbz rWORD3, 1(rSTR1)
+ lbz rWORD4, 1(rSTR2)
+ bdz L(b12)
+ cmpld cr1, rWORD3, rWORD4
+ lbzu rWORD5, 2(rSTR1)
+ lbzu rWORD6, 2(rSTR2)
+ bdz L(b13)
+ .align 4
+L(bLoop):
+ lbzu rWORD1, 1(rSTR1)
+ lbzu rWORD2, 1(rSTR2)
+ bne cr7, L(bLcr7)
+
+ cmpld cr6, rWORD5, rWORD6
+ bdz L(b3i)
+
+ lbzu rWORD3, 1(rSTR1)
+ lbzu rWORD4, 1(rSTR2)
+ bne cr1, L(bLcr1)
+
+ cmpld cr7, rWORD1, rWORD2
+ bdz L(b2i)
+
+ lbzu rWORD5, 1(rSTR1)
+ lbzu rWORD6, 1(rSTR2)
+ bne cr6, L(bLcr6)
+
+ cmpld cr1, rWORD3, rWORD4
+ bdnz L(bLoop)
+
+/* We speculatively load bytes before we have tested the previous
+ bytes. But we must avoid overrunning the length (in the CTR) to
+ prevent these speculative loads from causing a segfault. In that
+ case the loop will exit early (before all pending bytes are
+ tested), and we must complete the pending operations before
+ returning. */
+L(b1i):
+ bne cr7, L(bLcr7)
+ bne cr1, L(bLcr1)
+ b L(bx56)
+ .align 4
+L(b2i):
+ bne cr6, L(bLcr6)
+ bne cr7, L(bLcr7)
+ b L(bx34)
+ .align 4
+L(b3i):
+ bne cr1, L(bLcr1)
+ bne cr6, L(bLcr6)
+ b L(bx12)
+ .align 4
+L(bLcr7):
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
+ blr
+L(bLcr1):
+ li rRTN, 1
+ bgtlr cr1
+ li rRTN, -1
+ blr
+L(bLcr6):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+L(b13):
+ bne cr7, L(bx12)
+ bne cr1, L(bx34)
+L(bx56):
+ sub rRTN, rWORD5, rWORD6
+ blr
+ nop
+L(b12):
+ bne cr7, L(bx12)
+L(bx34):
+ sub rRTN, rWORD3, rWORD4
+ blr
+L(b11):
+L(bx12):
+ sub rRTN, rWORD1, rWORD2
+ blr
+
+ .align 4
+L(zeroLength):
+ li rRTN, 0
+ blr
+
+ .align 4
+/* At this point we know the strings have different alignment and the
+ compare length is at least 8 bytes. r12 contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+ of r12 to 0. If r12 == 0 then rSTR1 is double word
+ aligned and we can perform the DWunaligned loop.
+
+ Otherwise we know that rSTR1 is not DW aligned yet.
+ So we can force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DWaligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+ correct and the first DW (shifted) is in the expected register pair. */
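+
+/* Illustrative only: each rSTR2 doubleword used below is reconstructed
+ from two adjacent aligned loads, roughly (big-endian view):
+
+      w2 = (prev << rSHL) | (cur >> rSHR);   where rSHR == 64 - rSHL
+*/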
+L(unaligned):
+ std rWORD8, rWORD8SAVE(r1)
+ std rWORD7, rWORD7SAVE(r1)
+ std rOFF8, rOFF8SAVE(r1)
+ std rOFF16, rOFF16SAVE(r1)
+ std rOFF24, rOFF24SAVE(r1)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ cfi_offset(rOFF8, rOFF8SAVE)
+ cfi_offset(rOFF16, rOFF16SAVE)
+ cfi_offset(rOFF24, rOFF24SAVE)
+ cfi_offset(rOFF32, rOFF32SAVE)
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+ std rSHL, rSHLSAVE(r1)
+ cfi_offset(rSHL, rSHLSAVE)
+ clrldi rSHL, rSTR2, 61
+ beq cr6, L(duzeroLength)
+ std rSHR, rSHRSAVE(r1)
+ cfi_offset(rSHR, rSHRSAVE)
+ beq cr5, L(DWunaligned)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+ in the 1st rSTR1 DW. */
+ sub rWORD8_SHIFT, rSTR2, r12
+/* But do not attempt to address the DW before the DW that contains
+ the actual start of rSTR2. */
+ clrrdi rSTR2, rSTR2, 3
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+/* Compute the left/right shift counts for the unaligned rSTR2,
+ compensating for the logical (DW aligned) start of rSTR1. */
+ clrldi rSHL, rWORD8_SHIFT, 61
+ clrrdi rSTR1, rSTR1, 3
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ sldi rSHL, rSHL, 3
+ cmpld cr5, rWORD8_SHIFT, rSTR2
+ add rN, rN, r12
+ sldi rWORD6, r12, 3
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+ subfic rSHR, rSHL, 64
+ srdi r0, rN, 5 /* Divide by 32. */
+ andi. r12, rN, 24 /* Get the DW remainder. */
+/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+ this special case those bits may be discarded anyway. Also we
+ must avoid loading a DW where none of the bits are part of rSTR2 as
+ this may cross a page boundary and cause a page fault. */
+ li rWORD8, 0
+ blt cr5, L(dus0)
+ LD rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 8
+ sld rWORD8, rWORD8, rSHL
+
+L(dus0):
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ srd r12, rWORD2, rSHR
+ clrldi rN, rN, 61
+ beq L(duPs4)
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ bgt cr1, L(duPs3)
+ beq cr1, L(duPs2)
+
+/* Remainder is 8. */
+ .align 4
+L(dusP1):
+ sld rWORD8_SHIFT, rWORD2, rSHL
+ sld rWORD7, rWORD1, rWORD6
+ sld rWORD8, rWORD8, rWORD6
+ bge cr7, L(duP1e)
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+ cmpld cr5, rWORD7, rWORD8
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+/* Remainder is 16. */
+ .align 4
+L(duPs2):
+ sld rWORD6_SHIFT, rWORD2, rSHL
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD8, rWORD6
+ b L(duP2e)
+/* Remainder is 24. */
+ .align 4
+L(duPs3):
+ sld rWORD4_SHIFT, rWORD2, rSHL
+ sld rWORD3, rWORD1, rWORD6
+ sld rWORD4, rWORD8, rWORD6
+ b L(duP3e)
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(duPs4):
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ sld rWORD1, rWORD1, rWORD6
+ sld rWORD2, rWORD8, rWORD6
+ b L(duP4e)
+
+/* At this point we know rSTR1 is double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWunaligned):
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ clrrdi rSTR2, rSTR2, 3
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ srdi r0, rN, 5 /* Divide by 32. */
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+ sldi rSHL, rSHL, 3
+ LD rWORD6, 0, rSTR2
+ LD rWORD8, rOFF8, rSTR2
+ addi rSTR2, rSTR2, 8
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ subfic rSHR, rSHL, 64
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ beq L(duP4)
+ mtctr r0
+ bgt cr1, L(duP3)
+ beq cr1, L(duP2)
+
+/* Remainder is 8. */
+ .align 4
+L(duP1):
+ srd r12, rWORD8, rSHR
+ LD rWORD7, 0, rSTR1
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP1x)
+L(duP1e):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ bne cr5, L(duLcr5)
+ or rWORD4, r12, rWORD2_SHIFT
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ bne cr7, L(duLcr7)
+ or rWORD6, r0, rWORD4_SHIFT
+ cmpld cr6, rWORD5, rWORD6
+ b L(duLoop3)
+ .align 4
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+L(duP1x):
+ cmpld cr5, rWORD7, rWORD8
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+/* Remainder is 16. */
+ .align 4
+L(duP2):
+ srd r0, rWORD8, rSHR
+ LD rWORD5, 0, rSTR1
+ or rWORD6, r0, rWORD6_SHIFT
+ sld rWORD6_SHIFT, rWORD8, rSHL
+L(duP2e):
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP2x)
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ bne cr5, L(duLcr5)
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ cmpld cr1, rWORD3, rWORD4
+ b L(duLoop2)
+ .align 4
+L(duP2x):
+ cmpld cr5, rWORD7, rWORD8
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(duLcr6)
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+
+/* Remainder is 24. */
+ .align 4
+L(duP3):
+ srd r12, rWORD8, rSHR
+ LD rWORD3, 0, rSTR1
+ sld rWORD4_SHIFT, rWORD8, rSHL
+ or rWORD4, r12, rWORD6_SHIFT
+L(duP3e):
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP3x)
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ cmpld cr7, rWORD1, rWORD2
+ b L(duLoop1)
+ .align 4
+L(duP3x):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(duP4):
+ mtctr r0
+ srd r0, rWORD8, rSHR
+ LD rWORD1, 0, rSTR1
+ sld rWORD2_SHIFT, rWORD8, rSHL
+ or rWORD2, r0, rWORD6_SHIFT
+L(duP4e):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr7, L(duLcr7)
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
+ cmpld cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ cmpld cr5, rWORD7, rWORD8
+ bdz L(du24) /* Adjust CTR as we start with +4. */
+/* This is the primary loop. */
+ .align 4
+L(duLoop):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+L(duLoop1):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+L(duLoop2):
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(duLcr7)
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+L(duLoop3):
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ cmpld cr7, rWORD1, rWORD2
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ bdnz L(duLoop)
+
+L(duL4):
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ cmpld cr5, rWORD7, rWORD8
+L(du44):
+ bne cr7, L(duLcr7)
+L(du34):
+ bne cr1, L(duLcr1)
+L(du24):
+ bne cr6, L(duLcr6)
+L(du14):
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+/* At this point we have a remainder of 1 to 7 bytes to compare. We use
+ shift right double to eliminate bits beyond the compare length.
+
+ However it may not be safe to load rWORD2 which may be beyond the
+ string length. So we compare the bit length of the remainder to
+ the right shift count (rSHR). If the bit count is less than or equal
+ to the shift count, we do not need to load rWORD2 (all significant
+ bits are already in rWORD8_SHIFT). */
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ .align 4
+L(dutrim):
+ LD rWORD1, rOFF8, rSTR1
+ ld rWORD8, -8(r1)
+ subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
+ or rWORD2, r0, rWORD8_SHIFT
+ ld rWORD7, rWORD7SAVE(r1)
+ ld rSHL, rSHLSAVE(r1)
+ srd rWORD1, rWORD1, rN
+ srd rWORD2, rWORD2, rN
+ ld rSHR, rSHRSAVE(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ li rRTN, 0
+ cmpld cr7, rWORD1, rWORD2
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ beq cr7, L(dureturn24)
+ li rRTN, 1
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ bgtlr cr7
+ li rRTN, -1
+ blr
+ .align 4
+L(duLcr7):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr7, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr1):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr1, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr6):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr6, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr5):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr5, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+
+ .align 3
+L(duZeroReturn):
+ li rRTN, 0
+ .align 4
+L(dureturn):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dureturn29):
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+L(dureturn27):
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+L(dureturn24):
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ blr
+
+L(duzeroLength):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+END (MEMCMP)
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp, bcmp)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000000..bc734c9f4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,458 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'. */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
+ /* No need to use .machine power8 since mtvsrd is already
+ handled by the define. This avoids breakage on binutils
+ versions that do not support this machine specifier. */
+ .machine power7
+EALIGN (MEMSET, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memset):
+ cmpldi cr7,r5,31
+ neg r0,r3
+ mr r10,r3
+
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32 /* Replicate byte to word. */
+ ble cr7,L(write_LT_32)
+
+ andi. r11,r10,15 /* Check alignment of DST. */
+ insrdi r4,r4,32,0 /* Replicate word to double word. */
+
+ beq L(big_aligned)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+ /* Get DST aligned to 16 bytes. */
+1: bf 31,2f
+ stb r4,0(r10)
+ addi r10,r10,1
+
+2: bf 30,4f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+4: bf 29,8f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+8: bf 28,16f
+ std r4,0(r10)
+ addi r10,r10,8
+
+16: subf r5,r0,r5
+
+ .align 4
+L(big_aligned):
+ /* For sizes larger than 255 there are two possible paths:
+ - if the constant is '0', zero full cache lines with dcbz;
+ - otherwise, use vector instructions. */
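+ /* Illustrative only: the crand/branch sequence below amounts to
+
+      if (n > 255 && c == 0) goto huge_dcbz;
+      if (n >= 255) goto huge_vector;
+      ... else fall through to the 32..255 byte doubleword path. */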
+ cmpldi cr5,r5,255
+ dcbtst 0,r10
+ cmpldi cr6,r4,0
+ crand 27,26,21
+ bt 27,L(huge_dcbz)
+ bge cr5,L(huge_vector)
+
+
+ /* Sizes between 32 and 255 bytes with a constant different from 0: use
+ doubleword store instructions to achieve best throughput. */
+ srdi r8,r5,5
+ clrldi r11,r5,59
+ cmpldi cr6,r11,0
+ cmpdi r8,0
+ beq L(tail_bytes)
+ mtctr r8
+
+ /* Main aligned write loop, writes 32-bytes at a time. */
+ .align 4
+L(big_loop):
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+ bdz L(tail_bytes)
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+ bdnz L(big_loop)
+
+ b L(tail_bytes)
+
+ /* Write remaining 1~31 bytes. */
+ .align 4
+L(tail_bytes):
+ beqlr cr6
+
+ srdi r7,r11,4
+ clrldi r8,r11,60
+ mtocrf 0x01,r7
+
+ .align 4
+ bf 31,8f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+8: mtocrf 0x1,r8
+ bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+ .align 4
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+ /* Sizes larger than 255 bytes with a constant different from 0: use
+ vector instructions to achieve best throughput. */
+L(huge_vector):
+ /* Replicate set byte to quadword in VMX register. */
+ MTVSRD_V1_R4
+ xxpermdi 32,v0,v1,0
+ vspltb v2,v0,15
+
+ /* Main aligned write loop: 128 bytes at a time. */
+ li r6,16
+ li r7,32
+ li r8,48
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail)
+ mtctr r12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128loop):
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ bdnz L(aligned_128loop)
+
+ /* Write remaining 1~127 bytes. */
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+
+32: bf 26,16f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ addi r10,r10,32
+
+16: bf 27,8f
+ stvx v2,0,r10
+ addi r10,r10,16
+
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ /* Copies 4~7 bytes. */
+4: bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ /* Return original DST pointer. */
+ blr
+
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out a full cacheline of 128 bytes at a time.
+ Before using dcbz though, we need to get the destination 128-byte
+ aligned. */
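+
+ /* Illustrative only: once DST is 128-byte aligned the main loop is
+ conceptually
+
+      while (blocks--)
+        { dcbz (p); dcbz (p + 128); dcbz (p + 256); dcbz (p + 384);
+          p += 512; }
+
+ where each dcbz zeroes one 128-byte cache line. */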
+ .align 4
+L(huge_dcbz):
+ andi. r11,r10,127
+ neg r0,r10
+ beq L(huge_dcbz_aligned)
+
+ clrldi r0,r0,57
+ subf r5,r0,r5
+ srdi r0,r0,3
+ mtocrf 0x01,r0
+
+ /* Write 1~128 bytes until DST is aligned to 128 bytes. */
+8: bf 28,4f
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 29,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 30,1f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+1: bf 31,L(huge_dcbz_aligned)
+ std r4,0(r10)
+ addi r10,r10,8
+
+L(huge_dcbz_aligned):
+ /* Setup dcbz unroll offsets and count numbers. */
+ srdi r8,r5,9
+ clrldi r11,r5,55
+ cmpldi cr6,r11,0
+ li r9,128
+ cmpdi r8,0
+ beq L(huge_tail)
+ li r7,256
+ li r6,384
+ mtctr r8
+
+ .align 4
+L(huge_loop):
+ /* Set 512 bytes to zero in each iteration; the loop unrolling shows
+ a throughput boost for large sizes (2048 bytes or higher). */
+ dcbz 0,r10
+ dcbz r9,r10
+ dcbz r7,r10
+ dcbz r6,r10
+ addi r10,r10,512
+ bdnz L(huge_loop)
+
+ beqlr cr6
+
+L(huge_tail):
+ srdi r6,r11,8
+ srdi r7,r11,4
+ clrldi r8,r11,4
+ cmpldi cr6,r8,0
+ mtocrf 0x01,r6
+
+ beq cr6,L(tail)
+
+ /* We have 1~511 bytes remaining. */
+ .align 4
+32: bf 31,16f
+ dcbz 0,r10
+ dcbz r9,r10
+ addi r10,r10,256
+
+ .align 4
+16: mtocrf 0x01,r7
+ bf 28,8f
+ dcbz 0,r10
+ addi r10,r10,128
+
+ .align 4
+8: bf 29,4f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 30,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 31,L(tail)
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+ .align 4
+
+ /* Remaining 1~15 bytes. */
+L(tail):
+ mtocrf 0x01,r8
+
+ .align 4
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+ .align 4
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+ /* Handle short copies of 0~31 bytes. Best throughput is achieved
+ by just unrolling all operations. */
+ .align 4
+L(write_LT_32):
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(write_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(write_LT_32_aligned)
+
+ /* Force 4-byte alignment for DST. */
+ mtocrf 0x01,r0
+ subf r5,r0,r5
+
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+1: bf 31,L(end_4bytes_alignment)
+ stb r4,0(r10)
+ addi r10,r10,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(write_LT_32_aligned):
+ blt cr1,8f
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ stw r4,8(r10)
+ stw r4,12(r10)
+ addi r10,r10,16
+
+8: bf 28,L(tail4)
+ stw r4,0(r10)
+ stw r4,4(r10)
+ addi r10,r10,8
+
+ .align 4
+ /* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ blr
+
+ .align 4
+ /* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ sth r4,0(r10)
+ bflr 31
+ stb r4,2(r10)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ stb r4,4(r10)
+ blr
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(write_LE_8):
+ bne cr6,L(tail4)
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ blr
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY (__bzero)
+ CALL_MCOUNT 3
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
new file mode 100644
index 0000000000..1fc7b7cd39
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power7/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S
new file mode 100644
index 0000000000..955e738cee
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpcpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S
new file mode 100644
index 0000000000..c14d984dd0
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
new file mode 100644
index 0000000000..88b17a6eb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
@@ -0,0 +1,457 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
+
+#ifndef USE_AS_STRNCASECMP
+# define __STRCASECMP __strcasecmp
+# define STRCASECMP strcasecmp
+#else
+# define __STRCASECMP __strncasecmp
+# define STRCASECMP strncasecmp
+#endif
+/* Convert 16 bytes to lowercase and compare. */
+#define TOLOWER() \
+ vaddubm v8, v4, v1; \
+ vaddubm v7, v4, v3; \
+ vcmpgtub v8, v8, v2; \
+ vsel v4, v7, v4, v8; \
+ vaddubm v8, v5, v1; \
+ vaddubm v7, v5, v3; \
+ vcmpgtub v8, v8, v2; \
+ vsel v5, v7, v5, v8; \
+ vcmpequb. v7, v5, v4;
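+
+/* Illustrative only: per byte the macro above computes
+
+      t = (unsigned char) (b + 0xbf);    maps 'A'..'Z' to 0x00..0x19
+      b = (t <= 0x19) ? b + 0x20 : b;    lowercase only the uppers
+
+ using the constants v1 = 0xbf, v2 = 0x19 and v3 = 0x20 set up below. */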
+
+/*
+ * Get 16 bytes for unaligned case.
+ * reg1: Vector to hold next 16 bytes.
+ * reg2: Address to read from.
+ * reg3: Permute control vector.
+ * v8: Tmp vector used to mask unwanted bytes.
+ * v9: Tmp vector, 0 when a null is found in the first 16 bytes.
+ */
+#ifdef __LITTLE_ENDIAN__
+#define GET16BYTES(reg1, reg2, reg3) \
+ lvx reg1, 0, reg2; \
+ vspltisb v8, -1; \
+ vperm v8, v8, reg1, reg3; \
+ vcmpequb. v8, v0, v8; \
+ beq cr6, 1f; \
+ vspltisb v9, 0; \
+ b 2f; \
+ .align 4; \
+1: \
+ addi r6, reg2, 16; \
+ lvx v9, 0, r6; \
+2: \
+ vperm reg1, v9, reg1, reg3;
+#else
+#define GET16BYTES(reg1, reg2, reg3) \
+ lvx reg1, 0, reg2; \
+ vspltisb v8, -1; \
+ vperm v8, reg1, v8, reg3; \
+ vcmpequb. v8, v0, v8; \
+ beq cr6, 1f; \
+ vspltisb v9, 0; \
+ b 2f; \
+ .align 4; \
+1: \
+ addi r6, reg2, 16; \
+ lvx v9, 0, r6; \
+2: \
+ vperm reg1, reg1, v9, reg3;
+#endif
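+
+/* Illustrative only: GET16BYTES issues the second load only when the
+ first aligned chunk is null-free, so a terminated string never
+ triggers a read past its page. Roughly (hypothetical helper names):
+
+      first = aligned_load (p);
+      next  = null_in_visible_part (first) ? zeros : aligned_load (p + 16);
+      reg1  = permute (first, next);     the 16 bytes starting at p
+*/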
+
+/* Check null in v4, v5 and convert to lower. */
+#define CHECKNULLANDCONVERT() \
+ vcmpequb. v7, v0, v5; \
+ beq cr6, 3f; \
+ vcmpequb. v7, v0, v4; \
+ beq cr6, 3f; \
+ b L(null_found); \
+ .align 4; \
+3: \
+ TOLOWER()
+
+#ifdef _ARCH_PWR8
+# define VCLZD_V8_v7 vclzd v8, v7;
+# define MFVRD_R3_V1 mfvrd r3, v1;
+# define VSUBUDM_V9_V8 vsubudm v9, v9, v8;
+# define VPOPCNTD_V8_V8 vpopcntd v8, v8;
+# define VADDUQM_V7_V8 vadduqm v9, v7, v8;
+#else
+# define VCLZD_V8_v7 .long 0x11003fc2
+# define MFVRD_R3_V1 .long 0x7c230067
+# define VSUBUDM_V9_V8 .long 0x112944c0
+# define VPOPCNTD_V8_V8 .long 0x110047c3
+# define VADDUQM_V7_V8 .long 0x11274100
+#endif
+
+ .machine power7
+
+ENTRY (__STRCASECMP)
+#ifdef USE_AS_STRNCASECMP
+ CALL_MCOUNT 3
+#else
+ CALL_MCOUNT 2
+#endif
+#define rRTN r3 /* Return value */
+#define rSTR1 r10 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rCHAR1 r6 /* Byte read from 1st string */
+#define rCHAR2 r7 /* Byte read from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Word tolower(rCHAR1) */
+#define rLWR2 r12 /* Word tolower(rCHAR2) */
+#define rTMP r9
+#define rLOC r11 /* Default locale address */
+
+ cmpd cr7, rRTN, rSTR2
+
+ /* Get locale address. */
+ ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+ add rLOC, rTMP, __libc_tsd_LOCALE@tls
+ ld rLOC, 0(rLOC)
+
+ mr rSTR1, rRTN
+ li rRTN, 0
+ beqlr cr7
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+#endif
+ vspltisb v0, 0
+ vspltisb v8, -1
+ /* Check for null in the initial characters.
+ Check at most 16 chars, depending on the alignment.
+ If a null is present, proceed byte by byte. */
+ lvx v4, 0, rSTR1
+#ifdef __LITTLE_ENDIAN__
+ lvsr v10, 0, rSTR1 /* Compute mask. */
+ vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */
+#else
+ lvsl v10, 0, rSTR1
+ vperm v9, v4, v8, v10
+#endif
+ vcmpequb. v9, v0, v9 /* Check for null bytes. */
+ bne cr6, L(bytebybyte)
+ lvx v5, 0, rSTR2
+ /* Calculate alignment. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+ vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */
+#else
+ lvsl v6, 0, rSTR2
+ vperm v9, v5, v8, v6
+#endif
+ vcmpequb. v9, v0, v9 /* Check for null bytes. */
+ bne cr6, L(bytebybyte)
+ /* Check if the locale has non-ASCII characters. */
+ ld rTMP, 0(rLOC)
+ addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+ lwz rTMP, 0(r6)
+ cmpdi cr7, rTMP, 1
+ beq cr7, L(bytebybyte)
+
+ /* Load vector registers with values used for TOLOWER. */
+ /* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte. */
+ vspltisb v3, 2
+ vspltisb v9, 4
+ vsl v3, v3, v9
+ vaddubm v1, v3, v3
+ vnor v1, v1, v1
+ vspltisb v2, 7
+ vsububm v2, v3, v2
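+ /* Illustrative only: vspltisb encodes only -16..15, so the constants
+ are derived:
+      v3 = 2 << 4          = 0x20
+      v1 = ~(0x20 + 0x20)  = 0xbf
+      v2 = 0x20 - 7        = 0x19 */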
+
+ andi. rADDR1, rSTR1, 0xF
+ beq cr0, L(align)
+ addi r6, rSTR1, 16
+ lvx v9, 0, r6
+ /* Compute 16 bytes from previous two loads. */
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v9, v4, v10
+#else
+ vperm v4, v4, v9, v10
+#endif
+L(align):
+ andi. rADDR2, rSTR2, 0xF
+ beq cr0, L(align1)
+ addi r6, rSTR2, 16
+ lvx v9, 0, r6
+ /* Compute 16 bytes from previous two loads. */
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v6
+#else
+ vperm v5, v5, v9, v6
+#endif
+L(align1):
+ CHECKNULLANDCONVERT()
+ blt cr6, L(match)
+ b L(different)
+ .align 4
+L(match):
+ clrldi r6, rSTR1, 60
+ subfic r7, r6, 16
+#ifdef USE_AS_STRNCASECMP
+ sub r5, r5, r7
+#endif
+ add rSTR1, rSTR1, r7
+ add rSTR2, rSTR2, r7
+ andi. rADDR2, rSTR2, 0xF
+ addi rSTR1, rSTR1, -16
+ addi rSTR2, rSTR2, -16
+ beq cr0, L(aligned)
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+#else
+ lvsl v6, 0, rSTR2
+#endif
+ /* There are 2 loops depending on the input alignment.
+ Each loop gets 16 bytes from s1 and s2, checks for null,
+ converts to lowercase and compares. Loop until a difference
+ or null occurs. */
+L(s1_align):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+ addi r5, r5, -16
+#endif
+ lvx v4, 0, rSTR1
+ GET16BYTES(v5, rSTR2, v6)
+ CHECKNULLANDCONVERT()
+ blt cr6, L(s1_align)
+ b L(different)
+ .align 4
+L(aligned):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+ addi r5, r5, -16
+#endif
+ lvx v4, 0, rSTR1
+ lvx v5, 0, rSTR2
+ CHECKNULLANDCONVERT()
+ blt cr6, L(aligned)
+
+ /* Calculate and return the difference. */
+L(different):
+ vaddubm v1, v3, v3
+ vcmpequb v7, v0, v7
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zero. */
+ vspltisb v8, -1
+ VADDUQM_V7_V8
+ vandc v8, v9, v7
+ VPOPCNTD_V8_V8
+ vspltb v6, v8, 15
+ vcmpequb. v6, v6, v1
+ blt cr6, L(shift8)
+#else
+ /* Count leading zero. */
+ VCLZD_V8_v7
+ vspltb v6, v8, 7
+ vcmpequb. v6, v6, v1
+ blt cr6, L(shift8)
+ vsro v8, v8, v1
+#endif
+ b L(skipsum)
+ .align 4
+L(shift8):
+ vsumsws v8, v8, v0
+L(skipsum):
+#ifdef __LITTLE_ENDIAN__
+ /* Shift registers based on leading zero count. */
+ vsro v6, v5, v8
+ vsro v7, v4, v8
+ /* Merge and move to GPR. */
+ vmrglb v6, v6, v7
+ vslo v1, v6, v1
+ MFVRD_R3_V1
+ /* Place the characters that are different in the first position. */
+ sldi rSTR2, rRTN, 56
+ srdi rSTR2, rSTR2, 56
+ sldi rSTR1, rRTN, 48
+ srdi rSTR1, rSTR1, 56
+#else
+ vslo v6, v5, v8
+ vslo v7, v4, v8
+ vmrghb v1, v6, v7
+ MFVRD_R3_V1
+ srdi rSTR2, rRTN, 48
+ sldi rSTR2, rSTR2, 56
+ srdi rSTR2, rSTR2, 56
+ srdi rSTR1, rRTN, 56
+#endif
+ subf rRTN, rSTR1, rSTR2
+ extsw rRTN, rRTN
+ blr
+
+ .align 4
+ /* OK. We've hit the end of the string. We need to be careful that
+ we don't compare two strings as different because of junk beyond
+ the end of the strings... */
+L(null_found):
+ vaddubm v10, v3, v3
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zero. */
+ vspltisb v8, -1
+ VADDUQM_V7_V8
+ vandc v8, v9, v7
+ VPOPCNTD_V8_V8
+ vspltb v6, v8, 15
+ vcmpequb. v6, v6, v10
+ blt cr6, L(shift_8)
+#else
+ /* Count leading zero. */
+ VCLZD_V8_v7
+ vspltb v6, v8, 7
+ vcmpequb. v6, v6, v10
+ blt cr6, L(shift_8)
+ vsro v8, v8, v10
+#endif
+ b L(skipsum1)
+ .align 4
+L(shift_8):
+ vsumsws v8, v8, v0
+L(skipsum1):
+ /* Calculate shift count based on count of zero. */
+ vspltisb v10, 7
+ vslb v10, v10, v10
+ vsldoi v9, v0, v10, 1
+ VSUBUDM_V9_V8
+ vspltisb v8, 8
+ vsldoi v8, v0, v8, 1
+ VSUBUDM_V9_V8
+ /* Shift and remove junk after null character. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v5, v5, v9
+ vslo v4, v4, v9
+#else
+ vsro v5, v5, v9
+ vsro v4, v4, v9
+#endif
+ /* Convert and compare 16 bytes. */
+ TOLOWER()
+ blt cr6, L(retnull)
+ b L(different)
+ .align 4
+L(retnull):
+ li rRTN, 0
+ blr
+ .align 4
+L(bytebybyte):
+ /* Unrolled loop for POWER: loads are done with 'lbz' plus
+ an offset, and the string pointers are only updated at the
+ end of the unrolled iteration. */
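+ /* Illustrative only: each unrolled step is, in C terms,
+
+      c1 = *s1; c2 = *s2;
+      if (tab[c1] != tab[c2] || c1 == '\0')
+        goto done;            returns tab[c1] - tab[c2]
+
+ where tab is the locale's tolower table. */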
+ ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+ rldicl rTMP, r5, 62, 2
+ cmpdi cr7, rTMP, 0
+ beq cr7, L(lessthan4)
+ mtctr rTMP
+#endif
+L(loop):
+ cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
+ crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */
+ beq cr1, L(done)
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ cmpdi rCHAR1, 0
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+ bdnz L(loop)
+#else
+ b L(loop)
+#endif
+#ifdef USE_AS_STRNCASECMP
+L(lessthan4):
+ clrldi r5, r5, 62
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ mtctr r5
+L(loop1):
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ addi rSTR1, rSTR1, 1
+ addi rSTR2, rSTR2, 1
+ lbz rCHAR1, 0(rSTR1)
+ lbz rCHAR2, 0(rSTR2)
+ bdnz L(loop1)
+#endif
+L(done):
+ subf r0, rLWR2, rLWR1
+ extsw rRTN, r0
+ blr
+END (__STRCASECMP)
+
+weak_alias (__STRCASECMP, STRCASECMP)
+libc_hidden_builtin_def (__STRCASECMP)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
new file mode 100644
index 0000000000..0e746b7718
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
@@ -0,0 +1,29 @@
+/* Optimized strcasestr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STRCASESTR __strcasestr_ppc
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(__name)
+
+#undef weak_alias
+#define weak_alias(a,b)
+extern __typeof (strcasestr) __strcasestr_ppc attribute_hidden;
+
+#include <string/strcasestr.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S
new file mode 100644
index 0000000000..6ac6572f3b
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S
@@ -0,0 +1,538 @@
+/* Optimized strcasestr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* char * [r3] strcasestr (char *s [r3], char *pat [r4]) */
+
+/* The performance gain is obtained by comparing 16 bytes. */
+
+/* When the first char of r4 is hit ITERATIONS times in r3,
+ fall back to the default implementation. */
+#define ITERATIONS 64
+
+#ifndef STRCASESTR
+# define STRCASESTR __strcasestr
+#endif
+
+#ifndef STRLEN
+/* For builds without IFUNC support, local calls should be made to the
+ internal GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRLEN __GI_strlen
+# else
+# define STRLEN strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds without IFUNC support, local calls should be made to the
+ internal GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRNLEN __GI_strnlen
+# else
+# define STRNLEN __strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+# define STRCHR __GI_strchr
+# else
+# define STRCHR strchr
+# endif
+#endif
+
+/* Convert 16 bytes of v4 and reg to lowercase and compare. */
+#define TOLOWER(reg) \
+ vcmpgtub v6, v4, v1; \
+ vcmpgtub v7, v2, v4; \
+ vand v8, v7, v6; \
+ vand v8, v8, v3; \
+ vor v4, v8, v4; \
+ vcmpgtub v6, reg, v1; \
+ vcmpgtub v7, v2, reg; \
+ vand v8, v7, v6; \
+ vand v8, v8, v3; \
+ vor reg, v8, reg; \
+ vcmpequb. v6, reg, v4;
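+
+/* Illustrative only: per byte the macro computes
+
+      if (b > 'A' - 1 && b < 'Z' + 1)
+        b |= 0x20;        safe because uppercase letters have bit 5 clear
+
+ with the constants v1 = 64, v2 = 91 and v3 = 32 loaded before use. */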
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#ifdef _ARCH_PWR8
+#define VCLZD_V8_v7 vclzd v8, v7;
+#else
+#define VCLZD_V8_v7 .long 0x11003fc2
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+EALIGN (STRCASESTR, 4, 0)
+ CALL_MCOUNT 2
+ mflr r0 /* Load link register LR to r0. */
+ std r31, -8(r1) /* Save callers register r31. */
+ std r30, -16(r1) /* Save callers register r30. */
+ std r29, -24(r1) /* Save callers register r29. */
+ std r28, -32(r1) /* Save callers register r28. */
+ std r27, -40(r1) /* Save callers register r27. */
+ std r0, 16(r1) /* Store the link register. */
+ cfi_offset(r31, -8)
+ cfi_offset(r30, -16)
+ cfi_offset(r29, -24)
+ cfi_offset(r28, -32)
+ cfi_offset(r27, -40)
+ cfi_offset(lr, 16)
+ stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ dcbt 0, r3
+ dcbt 0, r4
+ cmpdi cr7, r3, 0 /* Input validation. */
+ beq cr7, L(retnull)
+ cmpdi cr7, r4, 0
+ beq cr7, L(retnull)
+
+ mr r29, r3
+ mr r30, r4
+ /* Load the first byte from r4 and check if it is null. */
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(ret_r3)
+
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r9, r10, __libc_tsd_LOCALE@tls
+ ld r9, 0(r9)
+ ld r9, LOCALE_CTYPE_TOUPPER(r9)
+ sldi r10, r6, 2 /* Convert to upper case. */
+ lwzx r28, r9, r10
+
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+ sldi r10, r6, 2 /* Convert to lower case. */
+ lwzx r27, r11, r10
+
+ /* Check if the first char is present. */
+ mr r4, r27
+ bl STRCHR
+ nop
+ mr r5, r3
+ mr r3, r29
+ mr r29, r5
+ mr r4, r28
+ bl STRCHR
+ nop
+ cmpdi cr7, r29, 0
+ beq cr7, L(firstpos)
+ cmpdi cr7, r3, 0
+ beq cr7, L(skipcheck)
+ cmpw cr7, r3, r29
+ ble cr7, L(firstpos)
+ /* Move r3 to the first occurrence. */
+L(skipcheck):
+ mr r3, r29
+L(firstpos):
+ mr r29, r3
+
+ sldi r9, r27, 8
+ or r28, r9, r28
+ /* Reg r27 is used to count the number of iterations. */
+ li r27, 0
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+
+ /* Find the length of pattern. */
+ mr r3, r30
+ bl STRLEN
+ nop
+
+	cmpdi	cr7, r3, 0	/* If the search string is empty.  */
+ beq cr7, L(ret_r3)
+
+ mr r31, r3
+ mr r4, r3
+ mr r3, r29
+ bl STRNLEN
+ nop
+
+ cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
+ blt cr7, L(retnull)
+
+ mr r3, r29
+
+	/* For locales where the single-byte case mapping does not match
+	   ASCII, take the byte-by-byte path.  */
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r9, r10, __libc_tsd_LOCALE@tls
+ ld r9, 0(r9)
+ ld r7, 0(r9)
+ addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+ lwz r8, 0(r7)
+ cmpdi cr7, r8, 1
+ beq cr7, L(bytebybyte)
+
+ /* If len(r4) < 16 handle byte by byte. */
+ /* For shorter strings we will not use vector registers. */
+ cmpdi cr7, r31, 16
+ blt cr7, L(bytebybyte)
+
+ /* Comparison values used for TOLOWER. */
+ /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */
+ vspltish v0, 0
+ vspltisb v5, 2
+ vspltisb v4, 4
+ vsl v3, v5, v4
+ vaddubm v1, v3, v3
+ vspltisb v5, 15
+ vaddubm v2, v5, v5
+ vaddubm v2, v1, v2
+ vspltisb v4, -3
+ vaddubm v2, v2, v4
+
+	/*
+	   1. Load 16 bytes from r3 and r4.
+	   2. If either contains a null byte, take the byte-by-byte path.
+	   3. Otherwise, convert both to lowercase and compare.
+	   4. If they are the same, proceed to 1.
+	   5. If they do not match, check whether the first char of r4 is
+	      present in the loaded 16 bytes of r3.
+	   6. If yes, move the position, load the next 16 bytes of r3 and
+	      proceed to 2.
+	   A hedged C sketch of this loop follows.
+	*/
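+
+	/* Helper names below are illustrative only, not real functions:
+
+	     while (iterations++ < ITERATIONS)
+	       {
+	         load16 (s, pat);                    // step 1
+	         if (has_null16 (s) || has_null16 (pat))
+	           return byte_by_byte (s, pat);     // step 2
+	         if (eq16 (tolower16 (s), tolower16 (pat)))
+	           continue;                         // steps 3-4
+	         s = skip_to_first_char (s, pat[0]); // steps 5-6
+	       }
+	     return __strcasestr_ppc (s, pat);       // ITERATIONS exceeded  */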
+
+ mr r8, r3 /* Save r3 for future use. */
+ mr r4, r30 /* Restore r4. */
+ clrldi r10, r4, 60
+ lvx v5, 0, r4 /* Load 16 bytes from r4. */
+ cmpdi cr7, r10, 0
+ beq cr7, L(begin2)
+ /* If r4 is unaligned, load another 16 bytes. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r4
+#else
+ lvsl v7, 0, r4
+#endif
+ addi r5, r4, 16
+ lvx v9, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v7
+#else
+ vperm v5, v5, v9, v7
+#endif
+L(begin2):
+ lvx v4, 0, r3
+ vcmpequb. v7, v0, v4 /* Check for null. */
+ beq cr6, L(nullchk6)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk6):
+ clrldi r10, r3, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(next16)
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r3
+#else
+ lvsl v7, 0, r3
+#endif
+ addi r5, r3, 16
+ /* If r3 is unaligned, load another 16 bytes. */
+ lvx v10, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v10, v4, v7
+#else
+ vperm v4, v4, v10, v7
+#endif
+L(next16):
+ vcmpequb. v6, v0, v5 /* Check for null. */
+ beq cr6, L(nullchk)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk):
+ vcmpequb. v6, v0, v4
+ beq cr6, L(nullchk1)
+ b L(retnull)
+
+ .align 4
+L(nullchk1):
+	/* Convert both v4 and v5 to lowercase.  */
+	TOLOWER(v5)
+	/* If both are the same, branch to match.  */
+ blt cr6, L(match)
+ /* Find if the first char is present in next 15 bytes. */
+#ifdef __LITTLE_ENDIAN__
+ vspltb v6, v5, 15
+ vsldoi v7, v0, v4, 15
+#else
+ vspltb v6, v5, 0
+ vspltisb v7, 8
+ vslo v7, v4, v7
+#endif
+ vcmpequb v7, v6, v7
+ vcmpequb. v6, v0, v7
+ /* Shift r3 by 16 bytes and proceed. */
+ blt cr6, L(shift16)
+ VCLZD_V8_v7
+#ifdef __LITTLE_ENDIAN__
+ vspltb v6, v8, 15
+#else
+ vspltb v6, v8, 7
+#endif
+ vcmpequb. v6, v6, v1
+ /* Shift r3 by 8 bytes and proceed. */
+ blt cr6, L(shift8)
+ b L(begin)
+
+ .align 4
+L(match):
+	/* There is a match of 16 bytes; check the next bytes.  */
+ cmpdi cr7, r31, 16
+ mr r29, r3
+ beq cr7, L(ret_r3)
+
+L(secondmatch):
+ addi r3, r3, 16
+ addi r4, r4, 16
+ /* Load next 16 bytes of r3 and r4 and compare. */
+ clrldi r10, r4, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextload)
+ /* Handle unaligned case. */
+ vor v6, v9, v9
+ vcmpequb. v7, v0, v6
+ beq cr6, L(nullchk2)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk2):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r4
+#else
+ lvsl v7, 0, r4
+#endif
+ addi r5, r4, 16
+ /* If r4 is unaligned, load another 16 bytes. */
+ lvx v9, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v11, v9, v6, v7
+#else
+ vperm v11, v6, v9, v7
+#endif
+ b L(compare)
+
+ .align 4
+L(nextload):
+ lvx v11, 0, r4
+L(compare):
+ vcmpequb. v7, v0, v11
+ beq cr6, L(nullchk3)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk3):
+ clrldi r10, r3, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextload1)
+ /* Handle unaligned case. */
+ vor v4, v10, v10
+ vcmpequb. v7, v0, v4
+ beq cr6, L(nullchk4)
+ b L(retnull)
+
+ .align 4
+L(nullchk4):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r3
+#else
+ lvsl v7, 0, r3
+#endif
+ addi r5, r3, 16
+ /* If r3 is unaligned, load another 16 bytes. */
+ lvx v10, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v10, v4, v7
+#else
+ vperm v4, v4, v10, v7
+#endif
+ b L(compare1)
+
+ .align 4
+L(nextload1):
+ lvx v4, 0, r3
+L(compare1):
+ vcmpequb. v7, v0, v4
+ beq cr6, L(nullchk5)
+ b L(retnull)
+
+ .align 4
+L(nullchk5):
+	/* Convert both v4 and v11 to lowercase.  */
+	TOLOWER(v11)
+	/* If both are the same, branch to secondmatch.  */
+ blt cr6, L(secondmatch)
+ /* Continue the search. */
+ b L(begin)
+
+ .align 4
+L(trailcheck):
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+L(loop2):
+ lbz r5, 0(r3) /* Load byte from r3. */
+ lbz r6, 0(r4) /* Load next byte from r4. */
+ cmpdi cr7, r6, 0 /* Is it null? */
+ beq cr7, L(updater3)
+ cmpdi cr7, r5, 0 /* Is it null? */
+ beq cr7, L(retnull) /* If yes, return. */
+ addi r3, r3, 1
+ addi r4, r4, 1 /* Increment r4. */
+ sldi r10, r5, 2 /* Convert to lower case. */
+ lwzx r10, r11, r10
+ sldi r7, r6, 2 /* Convert to lower case. */
+ lwzx r7, r11, r7
+ cmpw cr7, r7, r10 /* Compare with byte from r4. */
+ bne cr7, L(begin)
+ b L(loop2)
+
+ .align 4
+L(shift8):
+ addi r8, r8, 7
+ b L(begin)
+ .align 4
+L(shift16):
+ addi r8, r8, 15
+ .align 4
+L(begin):
+ addi r8, r8, 1
+ mr r3, r8
+	/* When our iterations exceed ITERATIONS, fall back to the default.  */
+ addi r27, r27, 1
+ cmpdi cr7, r27, ITERATIONS
+ beq cr7, L(default)
+ mr r4, r30 /* Restore r4. */
+ b L(begin2)
+
+ /* Handling byte by byte. */
+ .align 4
+L(loop1):
+ mr r3, r8
+ addi r27, r27, 1
+ cmpdi cr7, r27, ITERATIONS
+ beq cr7, L(default)
+ mr r29, r8
+ srdi r4, r28, 8
+ /* Check if the first char is present. */
+ bl STRCHR
+ nop
+ mr r5, r3
+ mr r3, r29
+ mr r29, r5
+ sldi r4, r28, 56
+ srdi r4, r4, 56
+ bl STRCHR
+ nop
+ cmpdi cr7, r29, 0
+ beq cr7, L(nextpos)
+ cmpdi cr7, r3, 0
+ beq cr7, L(skipcheck1)
+ cmpw cr7, r3, r29
+ ble cr7, L(nextpos)
+	/* Move r3 to the first occurrence.  */
+L(skipcheck1):
+ mr r3, r29
+L(nextpos):
+ mr r29, r3
+ cmpdi cr7, r3, 0
+ ble cr7, L(retnull)
+L(bytebybyte):
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+ mr r4, r30 /* Restore r4. */
+ mr r8, r3 /* Save r3. */
+ addi r8, r8, 1
+
+L(loop):
+ addi r3, r3, 1
+ lbz r5, 0(r3) /* Load byte from r3. */
+ addi r4, r4, 1 /* Increment r4. */
+ lbz r6, 0(r4) /* Load next byte from r4. */
+ cmpdi cr7, r6, 0 /* Is it null? */
+ beq cr7, L(updater3)
+ cmpdi cr7, r5, 0 /* Is it null? */
+ beq cr7, L(retnull) /* If yes, return. */
+ sldi r10, r5, 2 /* Convert to lower case. */
+ lwzx r10, r11, r10
+ sldi r7, r6, 2 /* Convert to lower case. */
+ lwzx r7, r11, r7
+ cmpw cr7, r7, r10 /* Compare with byte from r4. */
+ bne cr7, L(loop1)
+ b L(loop)
+
+ /* Handling return values. */
+ .align 4
+L(updater3):
+	subf r3, r31, r3 /* Subtract r31 (length of r4) from r3.  */
+ b L(end)
+
+ .align 4
+L(ret_r3):
+ mr r3, r29 /* Return point of match. */
+ b L(end)
+
+ .align 4
+L(retnull):
+ li r3, 0 /* Substring was not found. */
+ b L(end)
+
+ .align 4
+L(default):
+ mr r4, r30
+ bl __strcasestr_ppc
+ nop
+
+ .align 4
+L(end):
+ addi r1, r1, FRAMESIZE /* Restore stack pointer. */
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ ld r0, 16(r1) /* Restore the saved link register. */
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+	ld r29, -24(r1) /* Restore caller's register r29.  */
+	ld r30, -16(r1) /* Restore caller's register r30.  */
+	ld r31, -8(r1) /* Restore caller's register r31.  */
+ cfi_restore(lr)
+ cfi_restore(r27)
+ cfi_restore(r28)
+ cfi_restore(r29)
+ cfi_restore(r30)
+ cfi_restore(r31)
+	mtlr r0 /* Restore the link register.  */
+ blr
+END (STRCASESTR)
+
+weak_alias (__strcasestr, strcasestr)
+libc_hidden_def (__strcasestr)
+libc_hidden_builtin_def (strcasestr)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S
new file mode 100644
index 0000000000..e0c185c162
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S
@@ -0,0 +1,377 @@
+/* Optimized strchr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCHRNUL
+# ifndef STRCHRNUL
+# define FUNC_NAME __strchrnul
+# else
+# define FUNC_NAME STRCHRNUL
+# endif
+#else
+# ifndef STRCHR
+# define FUNC_NAME strchr
+# else
+# define FUNC_NAME STRCHR
+# endif
+#endif /* !USE_AS_STRCHRNUL */
+
+/* char * [r3] strchr (const char *s [r3], int c [r4])  */
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY (FUNC_NAME)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
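+
+	/* The insrdi sequence above replicates the byte as in this C
+	   sketch (illustrative only):
+
+	     uint64_t c = (uint8_t) ch;
+	     c |= c << 8;
+	     c |= c << 16;
+	     c |= c << 32;  */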
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a doubleword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r9,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r9,r4
+ cmpb r7,r9,r0
+ or r5,r10,r11
+ or r9,r6,r7
+ or r12,r5,r9
+ cmpdi cr7,r12,0
+ beq cr7,L(vector)
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+#ifdef USE_AS_STRCHRNUL
+ mr r5, r9
+#endif
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done):
+#ifdef USE_AS_STRCHRNUL
+ mr r10, r5
+#endif
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r10,-1
+ andc r3,r3,r10
+ popcntd r0,r3
+# ifndef USE_AS_STRCHRNUL
+ addi r4,r11,-1
+ andc r4,r4,r11
+ cmpld cr7,r3,r4
+ bgt cr7,L(no_match)
+# endif
+#else
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+# ifndef USE_AS_STRCHRNUL
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+# endif
+#endif
+ srdi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ blr
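+
+	/* The address computation above corresponds to this C sketch
+	   (illustrative only; x is the cmpb result and base the aligned
+	   address):
+
+	     // little-endian: count trailing zero bits
+	     zeros = popcount ((x - 1) & ~x);
+	     // big-endian: count leading zero bits
+	     zeros = clz (x);
+	     return base + (zeros >> 3);  */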
+
+	/* Check the first 32B in GPRs and move to the vectorized loop.  */
+ .p2align 5
+L(vector):
+ addi r3, r8, 8
+ andi. r10, r3, 31
+ bne cr0, L(loop)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1,r4)
+ li r5, 16
+ vspltb v1, v1, 7
+ /* Compare 32 bytes in each loop. */
+L(continue):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vcmpequb v6, v1, v4
+ vcmpequb v7, v1, v5
+ vor v8, v2, v3
+ vor v9, v6, v7
+ vor v11, v8, v9
+ vcmpequb. v11, v0, v11
+ addi r3, r3, 32
+ blt cr6, L(continue)
+ /* One (or both) of the quadwords contains a c/null byte. */
+ addi r3, r3, -32
+#ifndef USE_AS_STRCHRNUL
+ vcmpequb. v11, v0, v9
+ blt cr6, L(no_match)
+#endif
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ VBPERMQ(v6, v6, v10)
+ VBPERMQ(v7, v7, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+ vsldoi v7, v7, v7, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+ vsldoi v6, v6, v6, 6
+ vsldoi v7, v7, v7, 4
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1, v3, v2
+ vor v2, v6, v7
+ vor v4, v1, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute final length. */
+ /* Return NULL if null found before c. */
+#ifndef USE_AS_STRCHRNUL
+ lbz r4, 0(r3)
+ cmpdi cr7, r4, 0
+ beq cr7, L(no_match)
+#endif
+ blr
+
+#ifndef USE_AS_STRCHRNUL
+ .align 4
+L(no_match):
+ li r3,0
+ blr
+#endif
+
+/* We are here because strchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0	/* If r5 == 0, no null bytes
+				   have been found.  */
+ bne cr7,L(done_null)
+
+ mtcrf 0x01,r8
+
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32 bytes.  */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(vector1)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+ .p2align 5
+L(vector1):
+ addi r3, r8, 8
+ andi. r10, r3, 31
+ bne cr0, L(loop_null)
+ vspltisb v8, -1
+ vspltisb v0, 0
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ li r5, 16
+L(continue1):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vor v8, v2, v3
+ vcmpequb. v11, v0, v8
+ addi r3, r3, 32
+ blt cr6, L(continue1)
+ addi r3, r3, -32
+L(end1):
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v4, v3, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute final length. */
+ blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STRCHRNUL
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S
new file mode 100644
index 0000000000..3bf4b275dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S
@@ -0,0 +1,23 @@
+/* Optimized strchrnul implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRCHRNUL 1
+#include <sysdeps/powerpc/powerpc64/power8/strchr.S>
+
+weak_alias (__strchrnul,strchrnul)
+libc_hidden_builtin_def (__strchrnul)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S
new file mode 100644
index 0000000000..770484f1e1
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -0,0 +1,247 @@
+/* Optimized strcmp implementation for PowerPC64/POWER8.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
+/* Implements the function
+
+   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K pages as default, the page-cross handling assumes a minimum page
+   size of 4K.  */
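+
+/* Each 8-byte step follows this C sketch (illustrative only); cmpb
+   produces 0xff in every byte position where its two inputs match:
+
+     uint64_t w1 = load8 (s1), w2 = load8 (s2);
+     uint64_t znull = cmpb (w1, 0);    // 0xff where w1 has a '\0'
+     uint64_t zdiff = cmpb (w1, w2);   // 0xff where w1 and w2 agree
+     if ((znull | ~zdiff) != 0)        // a null or a difference found
+       goto compute_result;            // matches the orc. in the code  */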
+
+EALIGN (STRCMP, 4, 0)
+ li r0,0
+
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+ the code:
+
+ (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
+
+ rldicl r7,r3,0,52
+ rldicl r9,r4,0,52
+ cmpldi cr7,r7,4096-16
+ bgt cr7,L(pagecross_check)
+ cmpldi cr5,r9,4096-16
+ bgt cr5,L(pagecross_check)
+
+	/* For short strings of up to 16 bytes, load both s1 and s2 using
+	   unaligned doublewords and compare.  */
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ addi r7,r3,16
+ addi r4,r4,16
+
+L(align_8b):
+	/* Now that the first 16 bytes have been checked, align source1 to a
+	   doubleword and adjust the source2 address.  */
+ rldicl r9,r7,0,61 /* source1 alignment to doubleword */
+ subf r4,r9,r4 /* Adjust source2 address based on source1
+ alignment. */
+ rldicr r7,r7,0,60 /* Align source1 to doubleword. */
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check if source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+ andi. r9,r4,0x7
+ bne cr0,L(loop_diff_align)
+
+ /* If both source1 and source2 are doubleword aligned, there is no
+ need for page boundary cross checks. */
+
+ ld r8,0(r7)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ .align 4
+L(loop_equal_align):
+ ld r8,8(r7)
+ ld r10,8(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,16(r7)
+ ld r10,16(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ldu r8,24(r7)
+ ldu r10,24(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ b L(loop_equal_align)
+
+	/* A zero byte was found in r8 (the s1 doubleword), r9 contains the
+	   cmpb result and r10 the doubleword from s2.  The code isolates
+	   the bytes up to the end (including the '\0'), masking the
+	   remaining ones with 0xFF:
+
+ #if __LITTLE_ENDIAN__
+ (__builtin_ffsl (x) - 1) = counting trailing zero bits
+ r9 = (__builtin_ffsl (r9) - 1) + 8;
+ r9 = -1UL << r9
+ #else
+ r9 = __builtin_clzl (r9) + 8;
+ r9 = -1UL >> r9
+ #endif
+ r8 = r8 | r9
+ r10 = r10 | r9 */
+
+#ifdef __LITTLE_ENDIAN__
+ nor r9,r9,r9
+L(different_nocmpb):
+ neg r3,r9
+ and r9,r9,r3
+ cntlzd r9,r9
+ subfic r9,r9,63
+#else
+ not r9,r9
+L(different_nocmpb):
+ cntlzd r9,r9
+ subfic r9,r9,56
+#endif
+ srd r3,r8,r9
+ srd r10,r10,r9
+ rldicl r10,r10,0,56
+ rldicl r3,r3,0,56
+ subf r3,r10,r3
+ extsw r3,r3
+ blr
+
+ .align 4
+L(pagecross_check):
+ subfic r9,r9,4096
+ subfic r7,r7,4096
+ cmpld cr7,r7,r9
+ bge cr7,L(pagecross)
+ mr r7,r9
+
+	/* If the unaligned 16-byte read crosses a 4K page boundary, a
+	   simple byte-by-byte comparison is used until the page alignment
+	   for s1 is reached.  */
+L(pagecross):
+ add r7,r3,r7
+ subf r9,r3,r7
+ mtctr r9
+
+ .align 4
+L(pagecross_loop):
+	/* Load a byte from s1 and s2, check whether *s1 is equal to *s2
+	   and whether *s1 is '\0'.  */
+ lbz r9,0(r3)
+ lbz r10,0(r4)
+ addi r3,r3,1
+ addi r4,r4,1
+ cmplw cr7,r9,r10
+	cmpdi	cr5,r9,0
+ bne cr7,L(pagecross_ne)
+ beq cr5,L(pagecross_nullfound)
+ bdnz L(pagecross_loop)
+ b L(align_8b)
+
+ .align 4
+	/* The unaligned read of source2 will cross a 4K page boundary,
+	   and the differing byte or null may be in the remaining page
+	   bytes.  Since it cannot use the unaligned load, the algorithm
+	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
+L(check_source2_byte):
+ li r9,8
+ mtctr r9
+
+ .align 4
+L(check_source2_byte_loop):
+ lbz r9,0(r7)
+ lbz r10,0(r4)
+ addi r7,r7,1
+ addi r4,r4,1
+	cmplw	cr7,r9,r10
+	cmpdi	cr5,r9,0
+ bne cr7,L(pagecross_ne)
+ beq cr5,L(pagecross_nullfound)
+ bdnz L(check_source2_byte_loop)
+
+	/* If source2 is unaligned to a doubleword, the code needs to check
+	   on each iteration if the unaligned doubleword access will cross
+	   a 4K page boundary.  */
+ .align 5
+L(loop_unaligned):
+ ld r8,0(r7)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+ addi r7,r7,8
+ addi r4,r4,8
+
+L(loop_diff_align):
+ /* Check if [src2]+8 cross a 4k page boundary:
+
+ srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)
+
+ with PAGE_SIZE being 4096. */
+ rldicl r9,r4,0,52
+ cmpldi cr7,r9,4088
+ ble cr7,L(loop_unaligned)
+ b L(check_source2_byte)
+
+ .align 4
+L(pagecross_ne):
+ extsw r3,r9
+ mr r9,r10
+L(pagecross_retdiff):
+ subf r9,r9,r3
+ extsw r3,r9
+ blr
+
+ .align 4
+L(pagecross_nullfound):
+ li r3,0
+ b L(pagecross_retdiff)
+END (STRCMP)
+libc_hidden_builtin_def (strcmp)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S
new file mode 100644
index 0000000000..7f2cee4b1b
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -0,0 +1,270 @@
+/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPCPY
+# ifndef STPCPY
+# define FUNC_NAME __stpcpy
+# else
+# define FUNC_NAME STPCPY
+# endif
+#else
+# ifndef STRCPY
+# define FUNC_NAME strcpy
+# else
+# define FUNC_NAME STRCPY
+# endif
+#endif /* !USE_AS_STPCPY */
+
+/* Implements the function
+
+ char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+ or
+
+ char * [r3] stpcpy (char *dest [r3], const char *src [r4])
+
+ if USE_AS_STPCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K pages as default, the page-cross handling assumes a minimum page
+   size of 4K.  */
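+
+/* The overall flow, as a hedged C sketch (helper names are illustrative):
+
+     size_t len = find_null_by_cmpb (src);  // doubleword-at-a-time scan
+     copy_forward (dest, src, len + 1);     // unrolled dword/byte stores
+     return dest;                           // stpcpy: dest + len  */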
+
+ .machine power7
+EALIGN (FUNC_NAME, 4, 0)
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+	/* Check if [src]+15 will cross a 4K page boundary by checking
+	   whether bit 0x1000 of the address changes.  Basically:
+
+ uint64_t srcin = (uint64_t)src;
+ uint64_t ob = srcin & 4096UL;
+ uint64_t nb = (srcin+15UL) & 4096UL;
+ if (ob ^ nb)
+ goto pagecross; */
+
+ addi r9,r4,15
+ xor r9,r9,r4
+ rlwinm. r9,r9,0,19,19
+ bne L(pagecross)
+
+	/* For short strings (less than 16 bytes), just calculate the length
+	   as strlen would and issue a memcpy once the null is found.  */
+ mr r7,r4
+ ld r12,0(r7) /* Load doubleword from memory. */
+ cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */
+	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
+ bne cr7,L(done)
+
+ ldu r8,8(r7)
+ cmpb r10,r8,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ b L(loop_before)
+
+ .align 4
+L(pagecross):
+ clrrdi r7,r4,3 /* Align the address to doubleword boundary. */
+ rlwinm r6,r4,3,26,28 /* Calculate padding. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r7) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r5,r5,r6
+#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
+ bne cr7,L(done)
+
+ ldu r6,8(r7)
+ cmpb r10,r6,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ ld r12,0(r7)
+ cmpb r10,r12,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ ldu r6,8(r7)
+ cmpb r10,r6,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ /* We checked for 24 - x bytes, with x being the source alignment
+ (0 <= x <= 16), and no zero has been found. Start the loop
+ copy with doubleword aligned address. */
+ mr r7,r4
+ ld r12, 0(r7)
+ ldu r8, 8(r7)
+
+L(loop_before):
+	/* Save the two doublewords read from the source and align the
+	   source to 16 bytes for the loop.  */
+ mr r11,r3
+ std r12,0(r11)
+ std r8,8(r11)
+ addi r11,r11,16
+ rldicl r9,r4,0,60
+ subf r7,r9,r7
+ subf r11,r9,r11
+ b L(loop_start)
+
+ .align 5
+L(loop):
+ std r12, 0(r11)
+ std r6, 8(r11)
+ addi r11,r11,16
+L(loop_start):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+
+ ld r12, 8(r7)
+ ldu r6, 16(r7)
+ cmpb r10,r12,r0
+ cmpb r9,r6,r0
+ or r8,r9,r10 /* Merge everything in one doubleword. */
+ cmpdi cr7,r8,0
+ beq cr7,L(loop)
+
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ addi r4,r7,-8
+ cmpdi cr6,r10,0
+ addi r7,r7,-8
+ bne cr6,L(done2)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r9
+ addi r7,r7,8
+ b L(done2)
+
+ /* r10 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the length. */
+L(done):
+ mr r11,r3
+L(done2):
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10, -1 /* Form a mask from trailing zeros. */
+ andc r9, r9, r10
+ popcntd r6, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r6,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r4,r7
+ srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */
+ add r8,r5,r6 /* Compute final length. */
+#ifdef USE_AS_STPCPY
+ /* stpcpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
+	addi	r8,r8,1		/* Final '\0'.  */
+
+ cmpldi cr6,r8,8
+ mtocrf 0x01,r8
+ ble cr6,L(copy_LE_8)
+
+ cmpldi cr1,r8,16
+ blt cr1,8f
+
+ /* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ /* At least 6 bytes to go. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ ld r6,0(r4)
+ ld r8,8(r4)
+ addi r4,r4,16
+ std r6,0(r11)
+ std r8,8(r11)
+ addi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ ld r6,0(r4)
+ addi r4,r4,8
+ std r6,0(r11)
+ addi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz r6,0(r4)
+ stw r6,0(r11)
+ bf 30,L(tail5)
+ lhz r7,4(r4)
+ sth r7,4(r11)
+ bflr 31
+ lbz r8,6(r4)
+ stb r8,6(r11)
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz r6,0(r4)
+ sth r6,0(r11)
+ bflr 31
+ lbz r7,2(r4)
+ stb r7,2(r11)
+ blr
+
+ .align 4
+L(tail5):
+ bf 31,1f
+ lbz r6,4(r4)
+ stb r6,4(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz r6,0(r4)
+ stb r6,0(r11)
+ blr
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+ ld r6,0(r4)
+ std r6,0(r11)
+ blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S
new file mode 100644
index 0000000000..c9a7a2e3c3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S
@@ -0,0 +1,20 @@
+/* Optimized strcspn implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRCSPN 1
+#include <sysdeps/powerpc/powerpc64/power8/strspn.S>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S
new file mode 100644
index 0000000000..8f4a1fc1dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S
@@ -0,0 +1,301 @@
+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
+ loop.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+/* size_t [r3] strlen (const char *s [r3]) */
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+EALIGN (STRLEN, 4, 0)
+ CALL_MCOUNT 1
+ dcbt 0,r3
+ clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r4) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r5,r5,r6
+#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
+ bne cr7,L(done)
+
+ /* For shorter strings (< 64 bytes), we will not use vector registers,
+ as the overhead isn't worth it. So, let's use GPRs instead. This
+ will be done the same way as we do in the POWER7 implementation.
+ Let's see if we are aligned to a quadword boundary. If so, we can
+ jump to the first (non-vectorized) loop. Otherwise, we have to
+ handle the next DWORD first. */
+ mtcrf 0x01,r4
+ mr r9,r4
+ addi r9,r9,8
+ bt 28,L(align64)
+
+ /* Handle the next 8 bytes so we are aligned to a quadword
+ boundary. */
+ ldu r5,8(r4)
+ cmpb r10,r5,r0
+ cmpdi cr7,r10,0
+ addi r9,r9,8
+ bne cr7,L(done)
+
+L(align64):
+ /* Proceed to the old (POWER7) implementation, checking two doublewords
+	   per iteration.  For the first 56 bytes, we will just check for null
+ characters. After that, we will also check if we are 64-byte aligned
+ so we can jump to the vectorized implementation. We will unroll
+ these loops to avoid excessive branching. */
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Note: aligning to 64-byte will necessarily slow down performance for
+ strings around 64 bytes in length due to the extra comparisons
+ required to check alignment for the vectorized loop. This is a
+ necessary tradeoff we are willing to take in order to speed up the
+ calculation for larger strings. */
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+
+ /* At this point, we are necessarily 64-byte aligned. If no zeroes were
+ found, jump to the vectorized loop. */
+ beq cr7,L(preloop)
+
+L(dword_zero):
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r10,0
+ addi r4,r4,-8
+ bne cr6,L(done)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r11
+ addi r4,r4,8
+
+ /* If the null byte was found in the non-vectorized code, compute the
+ final length. r10 has the output of the cmpb instruction, that is,
+ it contains 0xff in the same position as the null byte in the
+ original doubleword from the string. Use that to calculate the
+ length. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
+ andc r9, r9,r10
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+ /* Vectorized implementation starts here. */
+ .p2align 4
+L(preloop):
+ /* Set up for the loop. */
+ mr r4,r9
+ li r7, 16 /* Load required offsets. */
+ li r8, 32
+ li r9, 48
+ li r12, 8
+ vxor v0,v0,v0 /* VR with null chars to use with
+ vcmpequb. */
+
+ /* Main loop to look for the end of the string. We will read in
+ 64-byte chunks. Align it to 32 bytes and unroll it 3 times to
+ leverage the icache performance. */
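+
+	/* Each iteration relies on the unsigned byte-minimum of four
+	   vectors having a zero byte iff at least one input does; as a C
+	   sketch over the 64 loaded bytes (illustrative only):
+
+	     for (i = 0; i < 16; i++)
+	       m[i] = min (min (a[i], b[i]), min (c[i], d[i]));
+	     if (memchr (m, 0, 16) != NULL)
+	       goto vmx_zero;  */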
+ .p2align 5
+L(loop):
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ bne cr6,L(vmx_zero)
+
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ bne cr6,L(vmx_zero)
+
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ beq cr6,L(loop)
+
+L(vmx_zero):
+ /* OK, we found a null byte. Let's look for it in the current 64-byte
+ block and mark it in its corresponding VR. */
+ vcmpequb v1,v1,v0
+ vcmpequb v2,v2,v0
+ vcmpequb v3,v3,v0
+ vcmpequb v4,v4,v0
+
+ /* We will now 'compress' the result into a single doubleword, so it
+ can be moved to a GPR for the final calculation. First, we
+ generate an appropriate mask for vbpermq, so we can permute bits into
+ the first halfword. */
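+
+	/* Conceptually, vbpermq with this mask packs the most significant
+	   bit of each of the 16 bytes into a 16-bit field, as in this C
+	   sketch (illustrative only):
+
+	     uint16_t mask = 0;
+	     for (i = 0; i < 16; i++)
+	       mask |= ((v[i] >> 7) & 1) << (15 - i);  */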
+ vspltisb v10,3
+ lvsl v11,r0,r0
+ vslb v10,v11,v10
+
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v1,v1,v10)
+ VBPERMQ(v2,v2,v10)
+ VBPERMQ(v3,v3,v10)
+ VBPERMQ(v4,v4,v10)
+
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v2,v2,v2,2
+ vsldoi v3,v3,v3,4
+ vsldoi v4,v4,v4,6
+#else
+ vsldoi v1,v1,v1,6
+ vsldoi v2,v2,v2,4
+ vsldoi v3,v3,v3,2
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1,v2,v1
+ vor v2,v3,v4
+ vor v4,v1,v2
+ MFVRD(r10,v4)
+
+	/* Adjust address to the beginning of the current 64-byte block.  */
+ addi r4,r4,-64
+
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
+ andc r9, r9,r10
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S
new file mode 100644
index 0000000000..32e09e4d94
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S
@@ -0,0 +1,20 @@
+/* Optimized strncasecmp implementation for POWER8.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRNCASECMP 1
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S
new file mode 100644
index 0000000000..3d8df90538
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -0,0 +1,327 @@
+/* Optimized strncmp implementation for PowerPC64/POWER8.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef STRNCMP
+# define STRNCMP strncmp
+#endif
+
+/* Implements the function
+
+ int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K pages as default, the page-cross handling assumes a minimum page
+   size of 4K.  */
+
+ .machine power7
+EALIGN (STRNCMP, 4, 0)
+ /* Check if size is 0. */
+ mr. r10,r5
+ beq cr0,L(ret0)
+
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+ the code:
+
+ (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
+ rldicl r8,r3,0,52
+ cmpldi cr7,r8,4096-16
+ bgt cr7,L(pagecross)
+ rldicl r9,r4,0,52
+ cmpldi cr7,r9,4096-16
+ bgt cr7,L(pagecross)
+
+	/* For short strings of up to 16 bytes, load both s1 and s2 using
+	   unaligned doublewords and compare.  */
+ ld r7,0(r3)
+ ld r9,0(r4)
+ li r8,0
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+	/* If the strings compared are equal but the size is less than or
+	   equal to 8, return 0.  */
+ cmpldi cr7,r10,8
+ li r9,0
+ ble cr7,L(ret1)
+ addi r5,r10,-8
+
+ ld r7,8(r3)
+ ld r9,8(r4)
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different0)
+
+ cmpldi cr7,r5,8
+ mr r9,r8
+ ble cr7,L(ret1)
+
+ /* Update pointers and size. */
+ addi r10,r10,-16
+ addi r3,r3,16
+ addi r4,r4,16
+
+	/* Now that the first 16 bytes have been checked, align source1 to a
+	   doubleword and adjust the source2 address.  */
+L(align_8b):
+ rldicl r5,r3,0,61
+ rldicr r3,r3,0,60
+ subf r4,r5,r4
+ add r10,r10,r5
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check if source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+ andi. r8,r4,0x7
+ beq cr0,L(loop_eq_align_0)
+
+ li r5,0
+ b L(loop_ne_align_1)
+
+	/* If source2 is unaligned to a doubleword, the code needs to check
+	   on each iteration if the unaligned doubleword access will cross
+	   a 4K page boundary.  */
+ .align 4
+L(loop_ne_align_0):
+ ld r7,0(r3)
+ ld r9,0(r4)
+ cmpb r8,r7,r5
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ cmpldi cr7,r10,8
+ ble cr7,L(ret0)
+ addi r10,r10,-8
+ addi r3,r3,8
+ addi r4,r4,8
+L(loop_ne_align_1):
+ rldicl r9,r4,0,52
+ cmpldi r7,r9,4088
+	cmpldi	cr7,r9,4088
+ cmpdi cr7,r10,0
+ beq cr7,L(ret0)
+
+ lbz r9,0(r3)
+ lbz r8,0(r4)
+ cmplw cr7,r9,r8
+ bne cr7,L(byte_ne_4)
+ cmpdi cr7,r9,0
+ beq cr7,L(size_reached_0)
+
+	li	r9,7
+ addi r8,r3,1
+ mtctr r9
+ addi r4,r4,1
+ addi r10,r10,-1
+ addi r3,r3,8
+
+	/* The unaligned read of source2 will cross a 4K page boundary,
+	   and the differing byte or null may be in the remaining page
+	   bytes.  Since it cannot use the unaligned load, the algorithm
+	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
+ .align 4
+L(loop_ne_align_byte):
+ cmpdi cr7,r10,0
+ addi r10,r10,-1
+ beq cr7,L(ret0)
+ lbz r9,0(r8)
+ lbz r7,0(r4)
+ addi r8,r8,1
+ addi r4,r4,1
+ cmplw cr7,r9,r7
+ cmpdi cr5,r9,0
+ bne cr7,L(size_reached_2)
+ beq cr5,L(size_reached_0)
+ bdnz L(loop_ne_align_byte)
+
+ cmpdi cr7,r10,0
+ bne+ cr7,L(loop_ne_align_0)
+
+ .align 4
+L(ret0):
+ li r9,0
+L(ret1):
+ mr r3,r9
+ blr
+
+	/* The code now checks if r8 and r10 are different by issuing a
+	   cmpb and shifts the result based on its output:
+
+ #ifdef __LITTLE_ENDIAN__
+ leadzero = (__builtin_ffsl (z1) - 1);
+ leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+ r1 = (r1 >> leadzero) & 0xFFUL;
+ r2 = (r2 >> leadzero) & 0xFFUL;
+ #else
+ leadzero = __builtin_clzl (z1);
+ leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+ r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
+ r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
+ #endif
+ return r1 - r2; */
+
+ .align 4
+L(different0):
+ mr r10,r5
+#ifdef __LITTLE_ENDIAN__
+L(different1):
+ neg r11,r8
+ sldi r10,r10,3
+ and r8,r11,r8
+ addi r10,r10,-8
+ cntlzd r8,r8
+ subfic r8,r8,63
+ extsw r8,r8
+ cmpld cr7,r8,r10
+ ble cr7,L(different2)
+ mr r8,r10
+L(different2):
+ extsw r8,r8
+#else
+L(different1):
+ addi r10,r10,-1
+ cntlzd r8,r8
+ sldi r10,r10,3
+ cmpld cr7,r8,r10
+ blt cr7,L(different2)
+ mr r8,r10
+L(different2):
+ subfic r8,r8,56
+#endif
+ srd r7,r7,r8
+ srd r9,r9,r8
+ rldicl r3,r7,0,56
+ rldicl r9,r9,0,56
+	subf	r9,r9,r3
+ extsw r9,r9
+ mr r3,r9
+ blr
+
+	/* If the unaligned 16-byte read crosses a 4K page boundary, a
+	   simple byte-by-byte comparison is used until the page alignment
+	   for s1 is reached.  */
+ .align 4
+L(pagecross):
+ lbz r7,0(r3)
+ lbz r9,0(r4)
+ subfic r8,r8,4095
+ cmplw cr7,r9,r7
+ bne cr7,L(byte_ne_3)
+ cmpdi cr7,r9,0
+ beq cr7,L(byte_ne_0)
+ addi r10,r10,-1
+ subf r7,r8,r10
+ subf r9,r7,r10
+ addi r9,r9,1
+ mtctr r9
+ b L(pagecross_loop1)
+
+ .align 4
+L(pagecross_loop0):
+ beq cr7,L(ret0)
+ lbz r9,0(r3)
+ lbz r8,0(r4)
+ addi r10,r10,-1
+ cmplw cr7,r9,r8
+ cmpdi cr5,r9,0
+	bne	cr7,L(byte_ne_2)
+	beq	cr5,L(byte_ne_0)
+L(pagecross_loop1):
+ cmpdi cr7,r10,0
+ addi r3,r3,1
+ addi r4,r4,1
+ bdnz L(pagecross_loop0)
+ cmpdi cr7,r7,0
+ li r9,0
+ bne+ cr7,L(align_8b)
+ b L(ret1)
+
+ /* If both source1 and source2 are doubleword aligned, there is no
+ need for page boundary cross checks. */
+ .align 4
+L(loop_eq_align_0):
+ ld r7,0(r3)
+ ld r9,0(r4)
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ cmpldi cr7,r10,8
+ ble cr7,L(ret0)
+ addi r9,r10,-9
+
+ li r5,0
+ srdi r9,r9,3
+ addi r9,r9,1
+ mtctr r9
+ b L(loop_eq_align_2)
+
+ .align 4
+L(loop_eq_align_1):
+ bdz L(ret0)
+L(loop_eq_align_2):
+ ldu r7,8(r3)
+ addi r10,r10,-8
+ ldu r9,8(r4)
+ cmpb r8,r7,r5
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ beq cr0,L(loop_eq_align_1)
+ b L(different1)
+
+ .align 4
+L(byte_ne_0):
+ li r7,0
+L(byte_ne_1):
+ subf r9,r9,r7
+ extsw r9,r9
+ b L(ret1)
+
+ .align 4
+L(byte_ne_2):
+ extsw r7,r9
+ mr r9,r8
+ b L(byte_ne_1)
+L(size_reached_0):
+ li r10,0
+L(size_reached_1):
+ subf r9,r9,r10
+ extsw r9,r9
+ b L(ret1)
+L(size_reached_2):
+ extsw r10,r9
+ mr r9,r7
+ b L(size_reached_1)
+L(byte_ne_3):
+ extsw r7,r7
+ b L(byte_ne_1)
+L(byte_ne_4):
+ extsw r10,r9
+ mr r9,r8
+ b L(size_reached_1)
+END(STRNCMP)
+libc_hidden_builtin_def(strncmp)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000000..6d40f30ff7
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,465 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
+#else
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+#endif /* !USE_AS_STPNCPY */
+
+#ifndef MEMSET
+/* For builds without IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define MEMSET __GI_memset
+# else
+# define MEMSET memset
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K pages as default, the page-cross handling assumes a minimum page
+   size of 4K.  */
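+
+/* Reference semantics, as a hedged C sketch (not the code path taken):
+
+     size_t len = strnlen (src, n);       // bytes to copy, at most n
+     memcpy (dest, src, len);             // copy the string prefix
+     memset (dest + len, '\0', n - len);  // null-pad the remainder
+     return dest;                         // stpncpy returns dest + len  */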
+
+ .machine power7
+EALIGN (FUNC_NAME, 4, 0)
+
+	/* Check if [src]+15 will cross a 4K page boundary by checking
+	   whether bit 0x1000 of the address changes.  Basically:
+
+ uint64_t srcin = (uint64_t)src;
+ uint64_t ob = srcin & 4096UL;
+ uint64_t nb = (srcin+15UL) & 4096UL;
+ if (ob ^ nb)
+ goto pagecross; */
+
+ addi r10,r4,16
+ rlwinm r9,r4,0,19,19
+
+ /* Save some non-volatile registers on the stack. */
+ std r26,-48(r1)
+ std r27,-40(r1)
+
+ rlwinm r8,r10,0,19,19
+
+ std r28,-32(r1)
+ std r29,-24(r1)
+
+ cmpld cr7,r9,r8
+
+ std r30,-16(r1)
+ std r31,-8(r1)
+
+ /* Update CFI. */
+ cfi_offset(r26, -48)
+ cfi_offset(r27, -40)
+ cfi_offset(r28, -32)
+ cfi_offset(r29, -24)
+ cfi_offset(r30, -16)
+ cfi_offset(r31, -8)
+
+ beq cr7,L(unaligned_lt_16)
+ rldicl r9,r4,0,61
+ subfic r8,r9,8
+ cmpld cr7,r5,r8
+ bgt cr7,L(pagecross)
+
+   /* At this point there are 1 to 15 bytes to check and write.  Since
+      control may arrive here either from the first unaligned 16-byte access
+      or from the bulk copy, the code uses unrolled byte reads/writes instead
+      of trying to analyze the cmpb results.  */
+L(short_path):
+ mr r9,r3
+L(short_path_1):
+ /* Return if there are no more bytes to be written. */
+ cmpdi cr7,r5,0
+ beq cr7,L(short_path_loop_end_1)
+L(short_path_2):
+ /* Copy one char from src (r4) and write it to dest (r9). If it is the
+ end-of-string, start the null padding. Continue, otherwise. */
+ lbz r10,0(r4)
+ cmpdi cr7,r10,0
+ stb r10,0(r9)
+ beq cr7,L(zero_pad_start_1)
+ /* If there are no more bytes to be written, return. */
+ cmpdi cr0,r5,1
+ addi r8,r9,1
+ addi r6,r5,-1
+ beq cr0,L(short_path_loop_end_0)
+ /* Copy another char from src (r4) to dest (r9). Check again if it is
+ the end-of-string. If so, start the null padding. */
+ lbz r10,1(r4)
+ cmpdi cr7,r10,0
+ stb r10,1(r9)
+ beq cr7,L(zero_pad_start_prepare_1)
+ /* Eagerly decrement r5 by 3, which is the number of bytes already
+ written, plus one write that will be performed later on. */
+ addi r10,r5,-3
+ b L(short_path_loop_1)
+
+ .align 4
+L(short_path_loop):
+ /* At this point, the induction variable, r5, as well as the pointers
+	   to dest and src (r9 and r4, respectively) have been updated.
+
+ Note: The registers r7 and r10 are induction variables derived from
+ r5. They are used to determine if the total number of writes has
+ been reached at every other write.
+
+ Copy one char from src (r4) and write it to dest (r9). If it is the
+ end-of-string, start the null padding. Continue, otherwise. */
+ lbz r8,0(r4)
+ addi r7,r10,-2
+ cmpdi cr5,r8,0
+ stb r8,0(r9)
+ beq cr5,L(zero_pad_start_1)
+ beq cr7,L(short_path_loop_end_0)
+ /* Copy another char from src (r4) to dest (r9). Check again if it is
+ the end-of-string. If so, start the null padding. */
+ lbz r8,1(r4)
+ cmpdi cr7,r8,0
+ stb r8,1(r9)
+ beq cr7,L(zero_pad_start)
+ mr r10,r7
+L(short_path_loop_1):
+ /* This block is reached after two chars have been already written to
+ dest. Nevertheless, r5 (the induction variable), r9 (the pointer to
+ dest), and r4 (the pointer to src) have not yet been updated.
+
+ At this point:
+ r5 holds the count of bytes yet to be written plus 2.
+ r9 points to the last two chars that were already written to dest.
+ r4 points to the last two chars that were already copied from src.
+
+ The algorithm continues by decrementing r5, the induction variable,
+ so that it reflects the last two writes. The pointers to dest (r9)
+ and to src (r4) are increment by two, for the same reason.
+
+ Note: Register r10 is another induction variable, derived from r5,
+ which determines if the total number of writes has been reached. */
+ addic. r5,r5,-2
+ addi r9,r9,2
+ cmpdi cr7,r10,0 /* Eagerly check if the next write is the last. */
+ addi r4,r4,2
+ addi r6,r9,1
+ bne cr0,L(short_path_loop) /* Check if the total number of writes
+ has been reached at every other
+ write. */
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+ b L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+ addi r3,r9,1
+ b L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+#endif
+L(short_path_loop_end):
+ /* Restore non-volatile registers. */
+ ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* This code pads the remainder of dest with NULL bytes. The algorithm
+ calculates the remaining size and calls memset. */
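+
+ /* In C terms, the padding step is just a memset call (sketch; the
+    register-to-name mapping here is only for illustration):
+
+      memset (pad_start, '\0', remaining);  // pad_start = r9, remaining = r5
+
+    strncpy returns the original dest pointer, while stpncpy returns a
+    pointer to the first NUL written, which is exactly what memset
+    returns. */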
+ .align 4
+L(zero_pad_start):
+ mr r5,r10
+ mr r9,r6
+L(zero_pad_start_1):
+ /* At this point:
+ - r5 holds the number of bytes that still have to be written to
+ dest.
+ - r9 points to the position, in dest, where the first null byte
+ will be written.
+ The above statements are true both when control reaches this label
+ from a branch or when falling through the previous lines. */
+#ifndef USE_AS_STPNCPY
+ mr r30,r3 /* Save the return value of strncpy. */
+#endif
+ /* Prepare the call to memset. */
+ mr r3,r9 /* Pointer to the area to be zero-filled. */
+ li r4,0 /* Byte to be written (zero). */
+
+ /* We delayed the creation of the stack frame, as well as the saving of
+ the link register, because only at this point are we sure that
+ doing so is actually needed. */
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+ cfi_offset(lr, 16)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ bl MEMSET
+ nop
+
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ ld r0,16(r1)
+ mtlr r0
+
+#ifndef USE_AS_STPNCPY
+ mr r3,r30 /* Restore the return value of strncpy, i.e.
+ dest. For stpncpy, the return value is the
+ same as the return value of memset. */
+#endif
+
+ /* Restore non-volatile registers and return. */
+ ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* The common case where [src]+16 will not cross a 4K page boundary.
+ In this case the code quickly checks the first 16 bytes by using doubleword
+ read/compares and updates dest if neither the total size is reached
+ nor a null byte is found in src. */
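+
+ /* The null-byte test relies on cmpb, which compares each byte of two
+    doublewords and produces 0xff in each matching byte position. A C
+    sketch of its semantics (illustrative only):
+
+      uint64_t cmpb (uint64_t a, uint64_t b)
+      {
+        uint64_t r = 0;
+        for (int i = 0; i < 8; i++)
+          if (((a >> (8 * i)) & 0xff) == ((b >> (8 * i)) & 0xff))
+            r |= (uint64_t) 0xff << (8 * i);
+        return r;
+      }
+
+    So cmpb (dword, 0) is nonzero exactly when dword contains a null
+    byte, which is the check performed below. */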
+ .align 4
+L(unaligned_lt_16):
+ cmpldi cr7,r5,7
+ ble cr7,L(short_path)
+ ld r7,0(r4)
+ li r8,0
+ cmpb r8,r7,r8
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2)
+ addi r6,r5,-8
+ std r7,0(r3)
+ addi r9,r3,8
+ cmpldi cr7,r6,7
+ addi r7,r4,8
+ ble cr7,L(short_path_prepare_1_1)
+ ld r4,8(r4)
+ cmpb r8,r4,r8
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2_1)
+ std r4,8(r3)
+ addi r29,r3,16
+ addi r5,r5,-16
+ /* Neither a null byte was found nor the total length reached,
+ so align to 16 bytes and issue a bulk copy/compare. */
+ b L(align_to_16b)
+
+ /* In the case of a 4K page boundary cross, the algorithm first aligns
+ the address to a doubleword, calculates a mask based on the alignment
+ to ignore the bytes before the string, and continues using doublewords. */
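+
+ /* A C sketch of the masking idea for the little-endian case (names are
+    illustrative): load the aligned doubleword containing the start of
+    the string and force the bytes before it to be non-null, so that a
+    subsequent null scan cannot match them:
+
+      uint64_t *aligned = (uint64_t *) ((uintptr_t) src & ~7UL);
+      unsigned pad = ((uintptr_t) src & 7) * 8;  // padding in bits
+      uint64_t w = *aligned;
+      uint64_t mask = ~0UL << pad;               // bits belonging to s
+      w |= ~mask;                                // non-string bytes -> 0xff
+
+    The big-endian variant shifts the mask in the other direction. */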
+ .align 4
+L(pagecross):
+ rldicr r11,r4,0,59 /* Align the address to 8 bytes boundary. */
+ li r6,-1 /* MASK = 0xffffffffffffffffUL. */
+ sldi r9,r9,3 /* Calculate padding. */
+ ld r7,0(r11) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r9,r6,r9 /* MASK = MASK << padding. */
+#else
+ srd r9,r6,r9 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r7,r9 /* Mask bits that are not part of the
+ string. */
+ li r7,0
+ cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ subf r8,r8,r5 /* Adjust total length. */
+ cmpldi cr7,r8,8 /* Check if length was reached. */
+ ble cr7,L(short_path_prepare_2)
+
+ /* For the next checks we have an aligned address, so check three
+ more doublewords to make sure we can read 16 unaligned bytes
+ to start the bulk copy with 16B aligned addresses. */
+ ld r7,8(r11)
+ cmpb r9,r7,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ addi r7,r8,-8
+ cmpldi cr7,r7,8
+ ble cr7,L(short_path_prepare_2)
+ ld r7,16(r11)
+ cmpb r9,r7,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ addi r8,r8,-16
+ cmpldi cr7,r8,8
+ ble cr7,L(short_path_prepare_2)
+ ld r8,24(r11)
+ cmpb r9,r8,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+
+ /* No null byte found in the 32 bytes read and length not reached,
+ read the source again using unaligned loads and store them. */
+ ld r9,0(r4)
+ addi r29,r3,16
+ addi r5,r5,-16
+ std r9,0(r3)
+ ld r9,8(r4)
+ std r9,8(r3)
+
+ /* Align source to 16 bytes and adjust dest and size. */
+L(align_to_16b):
+ rldicl r9,r10,0,60
+ rldicr r28,r10,0,59
+ add r12,r5,r9
+ subf r29,r9,r29
+
+ /* The bulk read/compare/copy loads two doublewords, then compares and
+ merges them in a single register for speed. This is an attempt to
+ speed up the null-checking process for bigger strings. */
+
+ cmpldi cr7,r12,15
+ ble cr7,L(short_path_prepare_1_2)
+
+ /* Main loop for large sizes, unrolled 2 times to get better use of
+ pipeline. */
+ ld r8,0(r28)
+ ld r10,8(r28)
+ li r9,0
+ cmpb r7,r8,r9
+ cmpb r9,r10,r9
+ or. r6,r9,r7
+ bne cr0,L(short_path_prepare_2_3)
+ addi r5,r12,-16
+ addi r4,r28,16
+ std r8,0(r29)
+ std r10,8(r29)
+ cmpldi cr7,r5,15
+ addi r9,r29,16
+ ble cr7,L(short_path_1)
+ mr r11,r28
+ mr r6,r29
+ li r30,0
+ subfic r26,r4,48
+ subfic r27,r9,48
+
+ b L(loop_16b)
+
+ .align 4
+L(loop_start):
+ ld r31,0(r11)
+ ld r10,8(r11)
+ cmpb r0,r31,r7
+ cmpb r8,r10,r7
+ or. r7,r0,r8
+ addi r5,r5,-32
+ cmpldi cr7,r5,15
+ add r4,r4,r26
+ add r9,r9,r27
+ bne cr0,L(short_path_prepare_2_2)
+ add r4,r28,r4
+ std r31,0(r6)
+ add r9,r29,r9
+ std r10,8(r6)
+ ble cr7,L(short_path_1)
+
+L(loop_16b):
+ ld r10,16(r11)
+ ld r0,24(r11)
+ cmpb r8,r10,r30
+ cmpb r7,r0,r30
+ or. r7,r8,r7
+ addi r12,r12,-32
+ cmpldi cr7,r12,15
+ addi r11,r11,32
+ bne cr0,L(short_path_2)
+ std r10,16(r6)
+ addi r6,r6,32
+ std r0,-8(r6)
+ bgt cr7,L(loop_start)
+
+ mr r5,r12
+ mr r4,r11
+ mr r9,r6
+ b L(short_path_1)
+
+ .align 4
+L(short_path_prepare_1_1):
+ mr r5,r6
+ mr r4,r7
+ b L(short_path_1)
+L(short_path_prepare_1_2):
+ mr r5,r12
+ mr r4,r28
+ mr r9,r29
+ b L(short_path_1)
+L(short_path_prepare_2):
+ mr r9,r3
+ b L(short_path_2)
+L(short_path_prepare_2_1):
+ mr r5,r6
+ mr r4,r7
+ b L(short_path_2)
+L(short_path_prepare_2_2):
+ mr r5,r12
+ mr r4,r11
+ mr r9,r6
+ b L(short_path_2)
+L(short_path_prepare_2_3):
+ mr r5,r12
+ mr r4,r28
+ mr r9,r29
+ b L(short_path_2)
+L(zero_pad_start_prepare_1):
+ mr r5,r6
+ mr r9,r8
+ b L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S
new file mode 100644
index 0000000000..3eadbfb09e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S
@@ -0,0 +1,433 @@
+/* Optimized strnlen implementation for POWER8 using a vmx loop.
+
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The following heuristic is implemented:
+ 1. Case maxlen <= 32: align the pointer to 8 bytes and loop through
+ reading doublewords. Uses the POWER7 algorithm.
+ 2. Case maxlen > 32: check for null bytes in the first 16 bytes using
+ unaligned accesses. Return the length if found. Otherwise:
+ 2.1 Case maxlen < 64: deduct the bytes previously read, align
+ the pointer to 16 bytes and loop through reading quadwords
+ until null bytes are found or maxlen is reached.
+ 2.2 Case maxlen >= 64: deduct the bytes previously read, align
+ the pointer to 64 bytes and set up a counter to loop through
+ reading in strides of 64 bytes. If the loop finishes with
+ null bytes not found, process the remainder bytes by
+ switching to the loop in 2.1. */
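+
+/* A C-level sketch of the above dispatch (the helper names are
+   hypothetical labels for the code paths implemented below):
+
+     size_t strnlen (const char *s, size_t maxlen)
+     {
+       if (maxlen <= 32)
+         return dword_loop (s, maxlen);    // L(small_range)
+       if (first16_has_null (s))           // unaligned loads
+         return early_length (s);          // L(early_find)
+       if (maxlen < 64)
+         return qword_loop (s, maxlen);    // L(smaller)
+       return stride64_loop (s, maxlen);   // L(loop_64B), then the
+                                           // 16B loop for the remainder
+     }
+*/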
+
+#include <sysdep.h>
+
+/* Define default page size to 4KB. */
+#define PAGE_SIZE 4096
+
+/* The following macros implement Power ISA v2.07 opcodes
+ that cannot be used directly in this code, to keep
+ compatibility with older binutils versions. */
+
+/* Move from vector register doubleword. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+/* Move to vector register doubleword. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+/* Vector Bit Permute Quadword. */
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+/* Vector Population Count Halfword. */
+#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21)))
+
+/* Vector Count Leading Zeros Halfword. */
+#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21)))
+
+
+/* int [r3] strnlen (char *s [r3], size_t maxlen [r4]) */
+/* TODO: change to power8 when minimum required binutils allows it. */
+ .machine power7
+ENTRY (__strnlen)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+
+ cmpldi r4,32 /* Check if maxlen <= 32. */
+ ble L(small_range) /* If maxlen <= 32. */
+
+ /* Upcoming 16-byte unaligned accesses must not cross the page boundary,
+ otherwise the processor throws a memory access error.
+ Use the following check to verify there is room for such accesses:
+ ((size_t) s % PAGE_SIZE) > (PAGE_SIZE - 16)
+ If the access is disallowed, switch to the code that handles
+ the string as when maxlen <= 32. */
+ clrldi r10,r3,52
+ cmpldi cr7,r10,PAGE_SIZE-16
+ bgt cr7,L(small_range) /* If less than 16B of page end. */
+
+ /* Compute our permute constant r8. */
+ li r7,0
+ /* Compute a bpermd constant to move bit 0 of each word into
+ a halfword value, and count trailing zeros. */
+#ifdef __LITTLE_ENDIAN__
+ li r8,0x2820
+ oris r8,r8,0x3830
+ sldi r8,r8,32
+ ori r8,r8,0x0800
+ oris r8,r8,0x1810
+#else
+ li r8,0x1018
+ oris r8,r8,0x0008
+ sldi r8,r8,32
+ ori r8,r8,0x3038
+ oris r8,r8,0x2028
+#endif
+
+ /* maxlen > 32. Optimistically check for null bytes in the first
+ 16 bytes of the string using unaligned accesses. */
+ ld r5,0(r3)
+ ld r6,8(r3)
+ cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */
+ cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */
+ or. r7,r10,r11
+ bne cr0, L(early_find) /* If found null bytes. */
+
+ /* At this point maxlen > 32 and null bytes were not found at first
+ 16 bytes. Prepare for loop using VMX. */
+
+ /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */
+
+ addi r5,r3,16 /* Align up, or just add the 16B we
+ already checked. */
+ li r0,15
+ and r7,r5,r0 /* Find offset into 16B alignment. */
+ andc r5,r5,r0 /* Quadword align up s to the next quadword. */
+ li r0,16
+ subf r0,r7,r0
+ subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */
+
+
+ /* Compute offsets for vmx loads, and precompute the vbpermq
+ constants for both the 64B and 16B loops. */
+ li r6,0
+ vspltisb v0,0
+ vspltisb v10,3
+ lvsl v11,r6,r6
+ vslb v10,v11,v10
+
+ cmpldi r4,64 /* Check maxlen < 64. */
+ blt L(smaller) /* If maxlen < 64 */
+
+ /* In order to begin the 64B loop, the address needs to be 64 bytes
+ aligned. So read quadwords until it is aligned or null bytes are
+ found. In the worst case it will be aligned after the fourth
+ iteration, so unroll the loop to avoid checking a counter. */
+ andi. r7,r5,63 /* Check if is 64 bytes aligned. */
+ beq cr0,L(preloop_64B) /* If it is already 64B aligned. */
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */
+ bne cr6,L(found_aligning64B) /* If found null bytes. */
+
+ /* Unroll the above code block 3x until aligned or null bytes are found. */
+ andi. r7,r5,63
+ beq cr0,L(preloop_64B)
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16
+ bne cr6,L(found_aligning64B)
+
+ andi. r7,r5,63
+ beq cr0,L(preloop_64B)
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16
+ bne cr6,L(found_aligning64B)
+
+ andi. r7,r5,63
+ beq cr0,L(preloop_64B)
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16
+ bne cr6,L(found_aligning64B)
+
+ /* At this point the address should be 64 bytes aligned.
+ Prepare for the 64B loop. */
+ .p2align 4
+L(preloop_64B):
+ /* Check if maxlen has become less than 64, which disallows the
+ 64B loop. If so, switch to the 16B loop code. */
+ cmpldi r4,64 /* Check if maxlen < 64. */
+ blt L(smaller) /* If maxlen < 64. */
+ /* Set some constant values. */
+ li r7,16
+ li r10,32
+ li r9,48
+
+ /* Compute the number of 64 bytes iterations needed. */
+ srdi r11,r4,6 /* Compute loop count (maxlen / 64). */
+ andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */
+ mtctr r11 /* Move loop count to counter register. */
+
+ /* Handle maxlen > 64. Loop over the bytes in strides of 64B. */
+ .p2align 4
+L(loop_64B):
+ lvx v1,r5,r6 /* r5 is the pointer to s. */
+ lvx v2,r5,r7
+ lvx v3,r5,r10
+ lvx v4,r5,r9
+ /* Take the byte-wise minimum of the four 16B vectors so that any
+ null byte propagates into v7, then check v7 for null bytes. */
+ vminub v5,v1,v2
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for null bytes. */
+ addi r5,r5,64 /* Advance pointer to the next iteration. */
+ bne cr6,L(found_64B) /* If found null bytes. */
+ bdnz L(loop_64B) /* Continue the loop if count > 0. */
+
+/* Hit loop end without null match. So branch to handle the remainder. */
+
+ /* Prepare a 16B loop to handle two cases:
+ 1. If 32 < maxlen < 64.
+ 2. If maxlen >= 64, and the 64B loop finished with null
+ bytes not found. Thus handle the remainder bytes here. */
+ .p2align 4
+L(smaller):
+ cmpldi r4,0 /* Check maxlen is zero. */
+ beq L(done) /* If maxlen is zero. */
+
+ /* Place rounded up number of qw's to check into a vmx
+ register, and use some vector tricks to minimize
+ branching. */
+ MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */
+ vspltisb v5,1
+ vspltisb v6,15
+ vspltb v2,v7,7
+ vaddubs v3,v5,v6
+
+#ifdef __LITTLE_ENDIAN__
+ vspltish v5,1 /* Compute 16 in each byte. */
+#endif
+
+ /* Loop in 16B aligned increments now. */
+ .p2align 4
+L(loop_16B):
+ lvx v1,r5,r6 /* Load quadword into vector register. */
+ addi r5,r5,16 /* Increment address to next 16B block. */
+ vor v7,v2,v2 /* Save loop count (v2) into v7. */
+ vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */
+ vminub v4,v1,v2
+ vcmpequb. v4,v4,v0 /* Checking for null bytes. */
+ beq cr6,L(loop_16B) /* If null bytes not found. */
+
+ vcmpequb v1,v1,v0
+ VBPERMQ(v1,v1,v10)
+#ifdef __LITTLE_ENDIAN__
+ vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */
+ vandc v2,v2,v1
+ VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */
+#else
+ VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */
+#endif
+ /* Truncate to maximum allowable offset. */
+ vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond
+ maxlen. */
+ vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */
+
+ MFVRD(r0,v1)
+ addi r5,r5,-16 /* Undo speculative bump. */
+ extsb r0,r0 /* Clear whatever gunk is in the high 56b. */
+ add r5,r5,r0 /* Add the offset of whatever was found. */
+L(done):
+ subf r3,r3,r5 /* Length is equal to the offset of null byte
+ matched minus the pointer to s. */
+ blr /* Done. */
+
+ /* Handle the case where maxlen > 64 and null bytes were found in the
+ last block of 64 bytes read. */
+ .p2align 4
+L(found_64B):
+ /* A zero was found. Reduce the result. */
+ vcmpequb v1,v1,v0
+ vcmpequb v2,v2,v0
+ vcmpequb v3,v3,v0
+ vcmpequb v4,v4,v0
+
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v1,v1,v10)
+ VBPERMQ(v2,v2,v10)
+ VBPERMQ(v3,v3,v10)
+ VBPERMQ(v4,v4,v10)
+
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v2,v2,v2,2
+ vsldoi v3,v3,v3,4
+ vsldoi v4,v4,v4,6
+#else
+ vsldoi v1,v1,v1,6
+ vsldoi v2,v2,v2,4
+ vsldoi v3,v3,v3,2
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1,v2,v1
+ vor v2,v3,v4
+ vor v4,v1,v2
+
+ /* Adjust address to the start of the current 64B block. */
+ addi r5,r5,-64
+
+ MFVRD(r10,v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r9,r10,-1 /* Form a mask from trailing zeros. */
+ andc r9,r9,r10
+ popcntd r0,r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r5
+ add r3,r5,r0 /* Compute final length. */
+ blr /* Done. */
+
+ /* Handle the case where null bytes were found while aligning
+ in preparation for the 64B loop. */
+ .p2align 4
+L(found_aligning64B):
+ VBPERMQ(v1,v1,v10)
+#ifdef __LITTLE_ENDIAN__
+ MFVRD(r10,v1)
+ addi r9,r10,-1 /* Form a mask from trailing zeros. */
+ andc r9,r9,r10
+ popcntd r0,r9 /* Count the bits in the mask. */
+#else
+ vsldoi v1,v1,v1,6
+ MFVRD(r10,v1)
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ addi r5,r5,-16 /* Adjust address to offset of last 16 bytes
+ read. */
+ /* The length is the offset of the last 16 bytes read minus the
+ pointer to s, plus the bytes before the match. */
+ subf r5,r3,r5
+ add r3,r5,r0
+ blr /* Done. */
+
+ /* Handle the case where maxlen > 32 and a null byte was found within
+ the first 16 bytes of s. */
+ .p2align 4
+L(early_find):
+ bpermd r5,r8,r10 /* r8 contains the bit permute constants. */
+ bpermd r6,r8,r11
+ sldi r5,r5,8
+ or r5,r5,r6 /* r5 should hold a 16B mask of
+ a potential 0. */
+ cntlzd r5,r5 /* Count leading zeros. */
+ addi r3,r5,-48 /* Deduct the 48 leading zeros always
+ present. */
+ blr /* Done. */
+
+ /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */
+ .p2align 4
+L(small_range):
+ clrrdi r8,r3,3 /* Align the pointer to 8B. */
+ li r0,0
+ /* Registers' contents at this point:
+ r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B,
+ r7 == last acceptable address (computed below). */
+ cmpldi r4,0 /* Check if maxlen is zero. */
+ beq L(end_max) /* If maxlen is zero. */
+
+ /* Calculate the last acceptable address and check for possible
+ addition overflow by using saturated math:
+ r7 = r3 + r4
+ r7 |= -(r7 < r3) */
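+
+ /* Equivalent C (sketch): compute s + maxlen and, if the addition
+    wrapped around, saturate to the highest address so the bound check
+    still terminates:
+
+      uintptr_t end = (uintptr_t) s + maxlen;
+      if (end < (uintptr_t) s)   // carry out of the addition
+        end = (uintptr_t) -1;    // saturate
+      end -= 1;                  // last acceptable address
+ */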
+ add r7,r3,r4
+ subfc r6,r3,r7
+ subfe r9,r9,r9
+ extsw r6,r9
+ or r7,r7,r6
+ addi r7,r7,-1
+
+ clrrdi r7,r7,3 /* Align the last acceptable address
+ to 8B. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ ld r12,0(r8) /* Load aligned doubleword. */
+ cmpb r10,r12,r0 /* Check for null bytes. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ sld r10,r10,r6
+#else
+ sld r10,r10,r6
+ srd r10,r10,r6
+#endif /* __LITTLE_ENDIAN__ */
+ cmpldi cr7,r10,0
+ bne cr7,L(done_small) /* If found null byte. */
+
+ cmpld r8,r7 /* Check if reached maxlen. */
+ beq L(end_max) /* If reached maxlen. */
+
+ /* Still handling the case of maxlen <= 32. Read aligned doublewords
+ until null bytes are found or maxlen is reached. */
+ .p2align 4
+L(loop_small):
+ ldu r12,8(r8) /* Load next doubleword and update r8. */
+ cmpb r10,r12,r0 /* Check for null bytes. */
+ cmpldi cr6,r10,0
+ bne cr6,L(done_small) /* If found null bytes. */
+ cmpld r8,r7 /* Check if reached maxlen. */
+ bne L(loop_small) /* If it has more bytes to read. */
+ mr r3,r4 /* Reached maxlen with null bytes not found.
+ Length is equal to maxlen. */
+ blr /* Done. */
+
+ /* Still handling case of maxlen <= 32. Found null bytes.
+ Registers: r10 == match bits within doubleword, r8 == address of
+ last doubleword read, r3 == pointer to s, r4 == maxlen. */
+ .p2align 4
+L(done_small):
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zeros. */
+ addi r0,r10,-1
+ andc r0,r0,r10
+ popcntd r0,r0
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ sub r3,r8,r3 /* Calculate total of bytes before the match. */
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r3,r0 /* Length until the match. */
+ cmpld r3,r4 /* Check if length is greater than maxlen. */
+ blelr
+ mr r3,r4 /* If length is greater than maxlen, return
+ maxlen. */
+ blr
+
+ /* Handle the case where maxlen was reached with null bytes not found. */
+ .p2align 4
+L(end_max):
+ mr r3,r4 /* Length is equal to maxlen. */
+ blr /* Done. */
+
+
+END (__strnlen)
+libc_hidden_def (__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S
new file mode 100644
index 0000000000..8eb74853c3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S
@@ -0,0 +1,464 @@
+/* Optimized strrchr implementation for PowerPC64/POWER8 using cmpb and VMX insns.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char *[r3] strrchr (char *s [r3], int c [r4]) */
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VADDUQM(t,a,b) .long (0x10000100 \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#ifdef __LITTLE_ENDIAN__
+/* Find the match position from v6 and place result in r6. */
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ vsldoi v6, v6, v6, 6; \
+ MFVRD(r7, v6); \
+ cntlzd r6, r7; \
+ subfic r6, r6, 15;
+/*
+ * Find the first null position to mask bytes after null.
+ * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
+ * Result placed at v2.
+ */
+# define FIND_NULL_POS(reg) \
+ vspltisb v11, -1; \
+ VADDUQM(v11, reg, v11); \
+ vandc v11, v11, reg; \
+ VPOPCNTD(v2, v11); \
+ vspltb v11, v2, 15; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vslo v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#else
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ MFVRD(r7, v6); \
+ addi r6, r7, -1; \
+ andc r6, r6, r7; \
+ popcntd r6, r6; \
+ subfic r6, r6, 15;
+# define FIND_NULL_POS(reg) \
+ VCLZD(v2, reg); \
+ vspltb v11, v2, 7; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vsro v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#endif /* !__LITTLE_ENDIAN__ */
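+
+/* A C sketch of what CALCULATE_MATCH computes: vbpermq packs one bit
+   per byte of the quadword into a 16-bit mask, and the offset of the
+   last (highest-addressed) match is 15 minus the number of leading
+   zeros in that mask (illustrative; assumes mask != 0):
+
+     int last_match_offset (uint16_t mask)
+     {
+       int lz16 = __builtin_clz (mask) - 16;  // clz over a 16-bit field
+       return 15 - lz16;                      // byte offset in the qw
+     }
+*/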
+ .machine power7
+ENTRY (strrchr)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r9,0 /* Used to store last occurrence. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* r4 has been replicated now. If c's low byte was zero even though
+ c itself was nonzero, treat it as a null match. */
+ cmpdi cr7,r4,0
+ beq cr7,L(null_match)
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+L(align):
+ andi. r12, r8, 15
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bne cr0, L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r7,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r7,r4
+ cmpb r7,r7,r0
+ or r12,r10,r11
+ or r5,r6,r7
+ or r5,r12,r5
+ cmpdi cr7,r5,0
+ beq cr7,L(vector)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+
+L(done):
+ /* If there is more than one 0xff in r11, find the first position of
+ 0xff in r11 and fill r10 with 0 from that position onward. */
+ cmpdi cr7,r11,0
+ beq cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r11,-1
+ andc r3,r3,r11
+ popcntd r0,r3
+#else
+ cntlzd r0,r11
+#endif
+ subfic r0,r0,63
+ li r6,-1
+#ifdef __LITTLE_ENDIAN__
+ srd r0,r6,r0
+#else
+ sld r0,r6,r0
+#endif
+ and r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ addi r3,r10,-1
+ andc r3,r3,r10
+ addi r10,r11,-1
+ andc r10,r10,r11
+ cmpld cr7,r3,r10
+ bgt cr7,L(no_match)
+#else
+ addi r3,r10,-1 /* Count trailing zeros before c matches. */
+ andc r3,r3,r10
+ popcntd r0,r3
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ subfic r0,r0,7
+ add r9,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ li r0,0
+ cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
+ beq cr7,L(align)
+
+ .align 4
+L(no_match):
+ mr r3,r9
+ blr
+
+/* Check the first 32B in GPRs and move to the vectorized loop. */
+ .p2align 5
+L(vector):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1, r4)
+ li r5, 16
+ vspltb v1, v1, 7
+ /* Compare 32 bytes in each loop. */
+L(continue):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vcmpequb v6, v1, v4
+ vcmpequb v7, v1, v5
+ vor v8, v2, v3
+ vor v9, v6, v7
+ vor v11, v8, v9
+ vcmpequb. v11, v0, v11
+ addi r3, r3, 32
+ blt cr6, L(continue)
+ vcmpequb. v8, v0, v8
+ blt cr6, L(match)
+
+ /* One (or both) of the quadwords contains c/null. */
+ vspltisb v8, 2
+ vspltisb v9, 5
+ /* Precompute values used for comparison. */
+ vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
+ vaddubm v8, v9, v9
+ vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
+
+ /* Check if null is in second qw. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(secondqw)
+
+ /* Null found in first qw. */
+ addi r8, r3, -32
+ /* Calculate the null position. */
+ FIND_NULL_POS(v2)
+ /* Check if null is in the first byte. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v6, v6, v2
+ vsro v6, v6, v2
+#else
+ vsro v6, v6, v2
+ vslo v6, v6, v2
+#endif
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(secondqw):
+ addi r8, r3, -16
+ FIND_NULL_POS(v3)
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match1)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v7, v7, v2
+ vsro v7, v7, v2
+#else
+ vsro v7, v7, v2
+ vslo v7, v7, v2
+#endif
+ vcmpequb. v11, v0, v7
+ blt cr6, L(no_match1)
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(no_match1):
+ addi r8, r8, -16
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(match):
+ /* One (or both) of the quadwords contains a match. */
+ mr r8, r3
+ vcmpequb. v8, v0, v7
+ blt cr6, L(firstqw)
+ /* Match found in second qw. */
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(firstqw):
+ addi r8, r8, -32
+ CALCULATE_MATCH()
+ add r9, r8, r6 /* Compute final length. */
+ b L(continue)
+/* We are here because strrchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done_null)
+
+ andi. r12, r8, 15
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bne cr0, L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32 bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(vector1)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+/* Check the first 32B in GPRs and move to the vectorized loop. */
+ .p2align 5
+L(vector1):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop_null)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ li r5, 16
+ /* Compare 32 bytes in each loop. */
+L(continue1):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vor v8, v2, v3
+ vcmpequb. v11, v0, v8
+ addi r3, r3, 32
+ blt cr6, L(continue1)
+ addi r3, r3, -32
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v4, v3, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute final length. */
+ blr
+END (strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S
new file mode 100644
index 0000000000..e9271898f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S
@@ -0,0 +1,202 @@
+/* Optimized strspn implementation for Power8.
+
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* size_t [r3] strspn (const char *string [r3],
+ const char *needleAccept [r4]) */
+
+/* This takes a novel approach by computing a 256-bit mask whereby
+ each set bit implies the byte is "accepted". P8 vector hardware
+ is extremely efficient at selecting bits from a mask.
+
+ One might ask "why not use bpermd for short strings"? It is
+ so slow that its performance roughly matches the generic PPC64
+ variant without any fancy masking, with the added expense of
+ building the mask. That was the first variant of this code. */
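+
+/* A C sketch of the accept map for the strspn case (strcspn starts
+   from all-ones and clears bits instead); bit 0 is the most
+   significant bit, following the ISA numbering used by srd below:
+
+     uint64_t map[4] = { 0, 0, 0, 0 };
+     for (const unsigned char *p = (const unsigned char *) accept; *p; p++)
+       map[*p / 64] |= 1UL << (63 - *p % 64);
+
+     // byte c is "accepted" iff its bit is set:
+     int ok = (map[c / 64] >> (63 - c % 64)) & 1;
+
+   The code below builds the same map in r5-r8 and then tests 16 bytes
+   per iteration with vbpermq instead of one bit at a time. */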
+
+
+
+#include "sysdep.h"
+
+#ifndef USE_AS_STRCSPN
+# define USE_AS_STRCSPN 0
+# ifndef STRSPN
+# define STRSPN strspn
+# endif
+# define INITIAL_MASK 0
+# define UPDATE_MASK(RA, RS, RB) or RA, RS, RB
+#else
+# ifndef STRSPN
+# define STRSPN strcspn
+# endif
+# define INITIAL_MASK -1
+# define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB
+#endif
+
+/* Simple macro to use VSX instructions in overlapping VR's. */
+#define XXVR(insn, vrt, vra, vrb) \
+ insn 32+vrt, 32+vra, 32+vrb
+
+/* ISA 2.07B instructions are not all defined for older binutils.
+ Macros are defined below for these newer instructions in order
+ to maintain compatibility. */
+
+/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+ /* This can be updated to power8 once the minimum version of
+ binutils supports power8 and the above instructions. */
+ .machine power7
+EALIGN(STRSPN, 4, 0)
+ CALL_MCOUNT 2
+
+ /* Generate useful constants for later on. */
+ vspltisb v1, 7
+ vspltisb v2, -1
+ vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */
+ vspltisb v10, 0
+ vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */
+ XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */
+
+ /* Prepare to compute 256b mask. */
+ addi r4, r4, -1
+ li r5, INITIAL_MASK
+ li r6, INITIAL_MASK
+ li r7, INITIAL_MASK
+ li r8, INITIAL_MASK
+
+#if USE_AS_STRCSPN
+ /* Ensure the null character never matches by clearing ISA bit 0 in
+ r5, which is the bit that will check for it in the later usage
+ of vbpermq. */
+ srdi r5, r5, 1
+#endif
+
+ li r11, 1
+ sldi r11, r11, 63
+
+ /* Start the interleaved mask computation.
+ This will eventually OR 1's into the ignored bits from vbpermq. */
+ lvsr v11, 0, r3
+ vspltb v11, v11, 0 /* Splat shift constant. */
+
+ /* Build a 256b mask in r5-r8. */
+ .align 4
+L(next_needle):
+ lbzu r9, 1(r4)
+
+ cmpldi cr0, r9, 0
+ cmpldi cr1, r9, 128
+
+ /* This is a little tricky. srd only uses the low 7 bits of the
+ shift amount, and any shift count of 64-127 yields 0. So, we can
+ effectively shift across 128b with one pair of srd's. */
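+
+ /* In C (sketch), using a helper that mimics srd's 7-bit shift
+    semantics, where any shift count of 64-127 yields zero:
+
+      uint64_t srd (uint64_t v, unsigned sh)
+      { return (sh & 0x40) ? 0 : v >> (sh & 0x3f); }
+
+      uint64_t lo = srd (1UL << 63, c);         // zero when (c % 128) >= 64
+      uint64_t hi = srd (1UL << 63, c ^ 0x40);  // zero when (c % 128) < 64
+
+    Exactly one of the pair is nonzero, selecting the right doubleword
+    of the 256-bit map for any c. */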
+ xori r12, r9, 0x40 /* Invert bit 6. */
+ srd r10, r11, r9 /* Mask for bits 0-63. */
+ srd r12, r11, r12 /* Mask for bits 64-127. */
+
+ beq cr0, L(start_cmp)
+
+ /* Now, or the value into the correct GPR. */
+ bge cr1,L(needle_gt128)
+ UPDATE_MASK (r5, r5, r10) /* 0 - 63. */
+ UPDATE_MASK (r6, r6, r12) /* 64 - 127. */
+ b L(next_needle)
+
+ .align 4
+L(needle_gt128):
+ UPDATE_MASK (r7, r7, r10) /* 128 - 191. */
+ UPDATE_MASK (r8, r8, r12) /* 192 - 255. */
+ b L(next_needle)
+
+
+ .align 4
+L(start_cmp):
+ /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */
+ mr r0, r3 /* Save r3 for final length computation. */
+ MTVRD (v5, r5)
+ MTVRD (v6, r6)
+ MTVRD (v7, r7)
+ MTVRD (v8, r8)
+
+ /* Continue interleaved mask generation. */
+#ifdef __LITTLE_ENDIAN__
+ vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */
+ vsplth v11, v11, 0 /* Only care about the high 16 bits of v11. */
+#else
+ vslw v11, v2, v11 /* Note, shift ignores higher order bits. */
+ vsplth v11, v11, 1 /* Only care about the low 16 bits of v11. */
+#endif
+ lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */
+
+ /* Do the merging of the bitmask. */
+ XXVR(xxmrghd, v5, v5, v6)
+ XXVR(xxmrghd, v6, v7, v8)
+
+ /* Finish mask generation. */
+ vand v11, v11, v4 /* Throw away bits not in the mask. */
+
+ /* Compare the first 1-16B, while masking unwanted bytes. */
+ clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */
+ vxor v9, v0, v1 /* Swap high bit. */
+ VBPERMQ (v8, v5, v0)
+ VBPERMQ (v7, v6, v9)
+ vor v7, v7, v8
+ vor v7, v7, v11 /* Ignore non-participating bytes. */
+ vcmpequh. v8, v7, v4
+ bnl cr6, L(done)
+
+ addi r3, r3, 16
+
+ .align 4
+L(vec):
+ lvx v0, 0, r3
+ addi r3, r3, 16
+ vxor v9, v0, v1 /* Swap high bit. */
+ VBPERMQ (v8, v5, v0)
+ VBPERMQ (v7, v6, v9)
+ vor v7, v7, v8
+ vcmpequh. v8, v7, v4
+ blt cr6, L(vec)
+
+ addi r3, r3, -16
+L(done):
+ subf r3, r0, r3
+ MFVRD (r10, v7)
+
+#ifdef __LITTLE_ENDIAN__
+ addi r0, r10, 1 /* Count the trailing 1's. */
+ andc r10, r10, r0
+ popcntd r10, r10
+#else
+ xori r10, r10, 0xffff /* Count leading 1's by inverting. */
+ addi r3, r3, -48 /* Account for the extra leading zeros. */
+ cntlzd r10, r10
+#endif
+
+ add r3, r3, r10
+ blr
+
+END(STRSPN)
+libc_hidden_builtin_def (STRSPN)