35 files changed, 7733 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies
new file mode 100644
index 0000000000..9a5e3c7277
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power7/fpu
+powerpc/powerpc64/power7
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile
new file mode 100644
index 0000000000..71a59529f3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += strcasestr-ppc64
+endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies
new file mode 100644
index 0000000000..1187cdfb0a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power7/fpu/
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
new file mode 100644
index 0000000000..4c42926a74
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
@@ -0,0 +1,303 @@
+/* Optimized expf().  PowerPC64/POWER8 version.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ *  Let K = 64 (table size).
+ *       e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ *  where:
+ *       x = m*log(2)/K + y,    y in [0.0..log(2)/K]
+ *       m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
+ *       values of 2^(j/K) are tabulated as T[j].
+ *
+ *       P(y) is a minimax polynomial approximation of expf(y)-1
+ *       on small interval [0.0..log(2)/K].
+ *
+ *       P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ *       z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ *  expf(NaN) = NaN
+ *  expf(+INF) = +INF
+ *  expf(-INF) = 0
+ *  expf(x) = 1 for subnormals
+ *  for finite argument, only expf(0)=1 is exact
+ *  expf(x) overflows if x>88.7228317260742190
+ *  expf(x) underflows if x<-103.972076416015620
+ */
+
+#define C1 0x42ad496b		/* Single precision 125*log(2).  */
+#define C2 0x31800000		/* Single precision 2^(-28).  */
+#define SP_INF 0x7f800000	/* Single precision Inf.  */
+#define SP_EXP_BIAS 0x1fc0	/* Single precision exponent bias.  */
+
+#define DATA_OFFSET r9
+
+/* Implements the function
+
+   float [fp1] expf (float [fp1] x)  */
+
+	.machine power8
+EALIGN(__ieee754_expf, 4, 0)
+	addis	DATA_OFFSET,r2,.Lanchor@toc@ha
+	addi	DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l
+
+	xscvdpspn v0,v1
+	mfvsrd	r8,v0		/* r8 = x  */
+	lfd	fp2,(.KLN2-.Lanchor)(DATA_OFFSET)
+	lfd	fp3,(.P2-.Lanchor)(DATA_OFFSET)
+	rldicl	r3,r8,32,33	/* r3 = |x|  */
+	lis	r4,C1@ha	/* r4 = 125*log(2)  */
+	ori	r4,r4,C1@l
+	cmpw	r3,r4
+	lfd	fp5,(.P3-.Lanchor)(DATA_OFFSET)
+	lfd	fp4,(.RS-.Lanchor)(DATA_OFFSET)
+	fmadd	fp2,fp1,fp2,fp4	/* fp2 = x * K/log(2) + (2^23 + 2^22)  */
+	bge	L(special_paths)	/* |x| >= 125*log(2) ?  */
+
+	lis	r4,C2@ha
+	ori	r4,r4,C2@l
+	cmpw	r3,r4
+	blt	L(small_args)	/* |x| < 2^(-28) ?  */
+
+	/* Main path: here if 2^(-28) <= |x| < 125*log(2) */
+	frsp	fp6,fp2
+	xscvdpsp v2,v2
+	mfvsrd	r8,v2
+	mr	r3,r8			/* r3 = m  */
+	rldicl	r8,r8,32,58		/* r8 = j  */
+	lfs	fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
+	fsubs	fp2,fp6,fp4		/* fp2 = m = x * K/log(2)  */
+	srdi	r3,r3,32
+	clrrwi	r3,r3,6			/* r3 = n  */
+	lfd	fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
+	fmadd	fp0,fp2,fp6,fp1		/* fp0 = y = x - m*log(2)/K  */
+	fmul	fp2,fp0,fp0		/* fp2 = z = y^2  */
+	lfd	fp4,(.P1-.Lanchor)(DATA_OFFSET)
+	lfd	fp6,(.P0-.Lanchor)(DATA_OFFSET)
+	lis	r4,SP_EXP_BIAS@ha
+	ori	r4,r4,SP_EXP_BIAS@l
+	add	r3,r3,r4
+	rldic	r3,r3,49,1		/* r3 = 2^n  */
+	fmadd	fp4,fp5,fp2,fp4		/* fp4 = P3 * z + P1  */
+	fmadd	fp6,fp3,fp2,fp6		/* fp6 = P2 * z + P0  */
+	mtvsrd	v1,r3
+	xscvspdp v1,v1
+	fmul	fp4,fp4,fp2		/* fp4 = (P3 * z + P1)*z  */
+	fmadd	fp0,fp0,fp6,fp4		/* fp0 = P(y)  */
+	sldi	r8,r8,3			/* Access doublewords from T[j].  */
+	addi	r6,DATA_OFFSET,(.Ttable-.Lanchor)
+	lfdx	fp3,r6,r8
+	fmadd	fp0,fp0,fp3,fp3		/* fp0 = T[j] * (1 + P(y))  */
+	fmul	fp1,fp1,fp0		/* fp1 = 2^n * T[j] * (1 + P(y))  */
+	frsp	fp1,fp1
+	blr
+
+	.align	4
+/* x is either underflow, overflow, infinite or NaN.  */
+L(special_paths):
+	srdi	r8,r8,32
+	rlwinm	r8,r8,3,29,29		/* r8 = 0, if x positive.
+					   r8 = 4, otherwise.  */
+	addi	r6,DATA_OFFSET,(.SPRANGE-.Lanchor)
+	lwzx	r4,r6,r8		/* r4 = .SPRANGE[signbit(x)]  */
+	cmpw	r3,r4
+	/* |x| <= .SPRANGE[signbit(x)]  */
+	ble	L(near_under_or_overflow)
+
+	lis	r4,SP_INF@ha
+	ori	r4,r4,SP_INF@l
+	cmpw	r3,r4
+	bge	L(arg_inf_or_nan)	/* |x| > Infinite ?  */
+
+	addi	r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor)
+	lfsx	fp1,r6,r8
+	fmuls	fp1,fp1,fp1
+	blr
+
+
+	.align	4
+L(small_args):
+	/* expf(x) = 1.0, where |x| < |2^(-28)|  */
+	lfs	fp2,(.SPone-.Lanchor)(DATA_OFFSET)
+	fadds	fp1,fp1,fp2
+	blr
+
+
+	.align	4
+L(arg_inf_or_nan:)
+	bne	L(arg_nan)
+
+	/* expf(+INF) = +INF
+	   expf(-INF) = 0  */
+	addi	r6,DATA_OFFSET,(.INF_ZERO-.Lanchor)
+	lfsx	fp1,r6,r8
+	blr
+
+
+	.align	4
+L(arg_nan):
+	/* expf(NaN) = NaN  */
+	fadd	fp1,fp1,fp1
+	frsp	fp1,fp1
+	blr
+
+	.align	4
+L(near_under_or_overflow):
+	frsp	fp6,fp2
+	xscvdpsp v2,v2
+	mfvsrd	r8,v2
+	mr	r3,r8			/* r3 = m  */
+	rldicl	r8,r8,32,58		/* r8 = j  */
+	lfs	fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
+	fsubs	fp2,fp6,fp4		/* fp2 = m = x * K/log(2)  */
+	srdi	r3,r3,32
+	clrrwi	r3,r3,6			/* r3 = n  */
+	lfd	fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
+	fmadd	fp0,fp2,fp6,fp1		/* fp0 = y = x - m*log(2)/K  */
+	fmul	fp2,fp0,fp0		/* fp2 = z = y^2  */
+	lfd	fp4,(.P1-.Lanchor)(DATA_OFFSET)
+	lfd	fp6,(.P0-.Lanchor)(DATA_OFFSET)
+	ld	r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET)
+	add	r3,r3,r4
+	rldic	r3,r3,46,1		/* r3 = 2  */
+	fmadd	fp4,fp5,fp2,fp4		/* fp4 = P3 * z + P1  */
+	fmadd	fp6,fp3,fp2,fp6		/* fp6 = P2 * z + P0  */
+	mtvsrd	v1,r3
+	fmul	fp4,fp4,fp2		/* fp4 = (P3*z + P1)*z  */
+	fmadd	fp0,fp0,fp6,fp4		/* fp0 = P(y)  */
+	sldi	r8,r8,3			/* Access doublewords from T[j].  */
+	addi	r6,DATA_OFFSET,(.Ttable-.Lanchor)
+	lfdx	fp3,r6,r8
+	fmadd	fp0,fp0,fp3,fp3		/* fp0 = T[j] * (1 + T[j])  */
+	fmul	fp1,fp1,fp0		/* fp1 = 2^n * T[j] * (1 + T[j])  */
+	frsp	fp1,fp1
+	blr
+END(__ieee754_expf)
+
+	.section .rodata, "a",@progbits
+.Lanchor:
+	.balign	8
+/* Table T[j] = 2^(j/K).  Double precision.  */
+.Ttable:
+	.8byte	0x3ff0000000000000
+	.8byte	0x3ff02c9a3e778061
+	.8byte	0x3ff059b0d3158574
+	.8byte	0x3ff0874518759bc8
+	.8byte	0x3ff0b5586cf9890f
+	.8byte	0x3ff0e3ec32d3d1a2
+	.8byte	0x3ff11301d0125b51
+	.8byte	0x3ff1429aaea92de0
+	.8byte	0x3ff172b83c7d517b
+	.8byte	0x3ff1a35beb6fcb75
+	.8byte	0x3ff1d4873168b9aa
+	.8byte	0x3ff2063b88628cd6
+	.8byte	0x3ff2387a6e756238
+	.8byte	0x3ff26b4565e27cdd
+	.8byte	0x3ff29e9df51fdee1
+	.8byte	0x3ff2d285a6e4030b
+	.8byte	0x3ff306fe0a31b715
+	.8byte	0x3ff33c08b26416ff
+	.8byte	0x3ff371a7373aa9cb
+	.8byte	0x3ff3a7db34e59ff7
+	.8byte	0x3ff3dea64c123422
+	.8byte	0x3ff4160a21f72e2a
+	.8byte	0x3ff44e086061892d
+	.8byte	0x3ff486a2b5c13cd0
+	.8byte	0x3ff4bfdad5362a27
+	.8byte	0x3ff4f9b2769d2ca7
+	.8byte	0x3ff5342b569d4f82
+	.8byte	0x3ff56f4736b527da
+	.8byte	0x3ff5ab07dd485429
+	.8byte	0x3ff5e76f15ad2148
+	.8byte	0x3ff6247eb03a5585
+	.8byte	0x3ff6623882552225
+	.8byte	0x3ff6a09e667f3bcd
+	.8byte	0x3ff6dfb23c651a2f
+	.8byte	0x3ff71f75e8ec5f74
+	.8byte	0x3ff75feb564267c9
+	.8byte	0x3ff7a11473eb0187
+	.8byte	0x3ff7e2f336cf4e62
+	.8byte	0x3ff82589994cce13
+	.8byte	0x3ff868d99b4492ed
+	.8byte	0x3ff8ace5422aa0db
+	.8byte	0x3ff8f1ae99157736
+	.8byte	0x3ff93737b0cdc5e5
+	.8byte	0x3ff97d829fde4e50
+	.8byte	0x3ff9c49182a3f090
+	.8byte	0x3ffa0c667b5de565
+	.8byte	0x3ffa5503b23e255d
+	.8byte	0x3ffa9e6b5579fdbf
+	.8byte	0x3ffae89f995ad3ad
+	.8byte	0x3ffb33a2b84f15fb
+	.8byte	0x3ffb7f76f2fb5e47
+	.8byte	0x3ffbcc1e904bc1d2
+	.8byte	0x3ffc199bdd85529c
+	.8byte	0x3ffc67f12e57d14b
+	.8byte	0x3ffcb720dcef9069
+	.8byte	0x3ffd072d4a07897c
+	.8byte	0x3ffd5818dcfba487
+	.8byte	0x3ffda9e603db3285
+	.8byte	0x3ffdfc97337b9b5f
+	.8byte	0x3ffe502ee78b3ff6
+	.8byte	0x3ffea4afa2a490da
+	.8byte	0x3ffefa1bee615a27
+	.8byte	0x3fff50765b6e4540
+	.8byte	0x3fffa7c1819e90d8
+
+.KLN2:
+	.8byte	0x40571547652b82fe	/* Double precision K/log(2).  */
+
+/* Double precision polynomial coefficients.  */
+.P0:
+	.8byte	0x3fefffffffffe7c6
+.P1:
+	.8byte	0x3fe00000008d6118
+.P2:
+	.8byte	0x3fc55550da752d4f
+.P3:
+	.8byte	0x3fa56420eb78fa85
+
+.RS:
+	.8byte	0x4168000000000000	/* Double precision 2^23 + 2^22.  */
+.NLN2K:
+	.8byte	0xbf862e42fefa39ef	/* Double precision -log(2)/K.  */
+.DP_EXP_BIAS:
+	.8byte	0x000000000000ffc0	/* Double precision exponent bias.  */
+
+	.balign	4
+.SPone:
+	.4byte	0x3f800000	/* Single precision 1.0.  */
+.SP_RS:
+	.4byte	0x4b400000	/* Single precision 2^23 + 2^22.  */
+
+.SPRANGE: /* Single precision overflow/underflow bounds.  */
+	.4byte	0x42b17217	/* if x>this bound, then result overflows.  */
+	.4byte	0x42cff1b4	/* if x<this bound, then result underflows.  */
+
+.SPLARGE_SMALL:
+	.4byte	0x71800000	/* 2^100.  */
+	.4byte	0x0d800000	/* 2^-100.  */
+
+.INF_ZERO:
+	.4byte	0x7f800000	/* Single precision Inf.  */
+	.4byte	0		/* Single precision zero.  */
+
+strong_alias (__ieee754_expf, __expf_finite)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
new file mode 100644
index 0000000000..7fd86fdf87
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power7/fpu/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
new file mode 100644
index 0000000000..8dfa0076e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
@@ -0,0 +1,508 @@
+/* Optimized cosf().  PowerPC64/POWER8 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define _ERRNO_H	1
+#include <bits/errno.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT	23
+#define FLOAT_EXPONENT_BIAS	127
+#define INTEGER_BITS		3
+
+#define PI_4		0x3f490fdb	/* PI/4 */
+#define NINEPI_4	0x40e231d6	/* 9 * PI/4 */
+#define TWO_PN5		0x3d000000	/* 2^-5 */
+#define TWO_PN27	0x32000000	/* 2^-27 */
+#define INFINITY	0x7f800000
+#define TWO_P23		0x4b000000	/* 2^23 */
+#define FX_FRACTION_1_28 0x9249250	/* 0x100000000 / 28 + 1 */
+
+	/* Implements the function
+
+	   float [fp1] cosf (float [fp1] x)  */
+
+	.machine power8
+EALIGN(__cosf, 4, 0)
+	addis	r9,r2,L(anchor)@toc@ha
+	addi	r9,r9,L(anchor)@toc@l
+
+	lis	r4,PI_4@h
+	ori	r4,r4,PI_4@l
+
+	xscvdpspn v0,v1
+	mfvsrd	r8,v0
+	rldicl	r3,r8,32,33		/* Remove sign bit.  */
+
+	cmpw	r3,r4
+	bge	L(greater_or_equal_pio4)
+
+	lis	r4,TWO_PN5@h
+	ori	r4,r4,TWO_PN5@l
+
+	cmpw	r3,r4
+	blt	L(less_2pn5)
+
+	/* Chebyshev polynomial of the form:
+	 * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+
+	lfd	fp9,(L(C0)-L(anchor))(r9)
+	lfd	fp10,(L(C1)-L(anchor))(r9)
+	lfd	fp11,(L(C2)-L(anchor))(r9)
+	lfd	fp12,(L(C3)-L(anchor))(r9)
+	lfd	fp13,(L(C4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	lfd     fp3,(L(DPone)-L(anchor))(r9)
+
+	fmadd	fp4,fp2,fp13,fp12	/* C3+x^2*C4 */
+	fmadd	fp4,fp2,fp4,fp11	/* C2+x^2*(C3+x^2*C4) */
+	fmadd	fp4,fp2,fp4,fp10	/* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+	fmadd	fp1,fp2,fp4,fp3		/* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+	frsp	fp1,fp1			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(greater_or_equal_pio4):
+	lis	r4,NINEPI_4@h
+	ori	r4,r4,NINEPI_4@l
+	cmpw	r3,r4
+	bge	L(greater_or_equal_9pio4)
+
+	/* Calculate quotient of |x|/(PI/4).  */
+	lfd	fp2,(L(invpio4)-L(anchor))(r9)
+	fabs	fp1,fp1			/* |x| */
+	fmul	fp2,fp1,fp2		/* |x|/(PI/4) */
+	fctiduz	fp2,fp2
+	mfvsrd	r3,v2			/* n = |x| mod PI/4 */
+
+	/* Now use that quotient to find |x| mod (PI/2).  */
+	addi	r7,r3,1
+	rldicr	r5,r7,2,60		/* ((n+1) >> 1) << 3 */
+	addi	r6,r9,(L(pio2_table)-L(anchor))
+	lfdx	fp4,r5,r6
+	fsub	fp1,fp1,fp4
+
+	.balign 16
+L(reduced):
+	/* Now we are in the range -PI/4 to PI/4.  */
+
+	/* Work out if we are in a positive or negative primary interval.  */
+	addi    r7,r7,2
+	rldicl	r4,r7,62,63		/* ((n+3) >> 2) & 1 */
+
+	/* Load a 1.0 or -1.0.  */
+	addi	r5,r9,(L(ones)-L(anchor))
+	sldi	r4,r4,3
+	lfdx	fp0,r4,r5
+
+	/* Are we in the primary interval of sin or cos?  */
+	andi.	r4,r7,0x2
+	bne	L(cos)
+
+	/* Chebyshev polynomial of the form:
+	   x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+
+	lfd	fp9,(L(S0)-L(anchor))(r9)
+	lfd	fp10,(L(S1)-L(anchor))(r9)
+	lfd	fp11,(L(S2)-L(anchor))(r9)
+	lfd	fp12,(L(S3)-L(anchor))(r9)
+	lfd	fp13,(L(S4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+
+	fmadd	fp4,fp2,fp13,fp12	/* S3+x^2*S4 */
+	fmadd	fp4,fp2,fp4,fp11	/* S2+x^2*(S3+x^2*S4) */
+	fmadd	fp4,fp2,fp4,fp10	/* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+	fmadd	fp4,fp3,fp4,fp1		/* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+	fmul	fp4,fp4,fp0		/* Add in the sign.  */
+	frsp	fp1,fp4			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(cos):
+	/* Chebyshev polynomial of the form:
+	   1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+
+	lfd	fp9,(L(C0)-L(anchor))(r9)
+	lfd	fp10,(L(C1)-L(anchor))(r9)
+	lfd	fp11,(L(C2)-L(anchor))(r9)
+	lfd	fp12,(L(C3)-L(anchor))(r9)
+	lfd	fp13,(L(C4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	lfd	fp3,(L(DPone)-L(anchor))(r9)
+
+	fmadd	fp4,fp2,fp13,fp12	/* C3+x^2*C4 */
+	fmadd	fp4,fp2,fp4,fp11	/* C2+x^2*(C3+x^2*C4) */
+	fmadd	fp4,fp2,fp4,fp10	/* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+	fmadd	fp4,fp2,fp4,fp3		/* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+	fmul	fp4,fp4,fp0		/* Add in the sign.  */
+	frsp	fp1,fp4			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(greater_or_equal_9pio4):
+	lis	r4,INFINITY@h
+	ori	r4,r4,INFINITY@l
+	cmpw	r3,r4
+	bge	L(inf_or_nan)
+
+	lis	r4,TWO_P23@h
+	ori	r4,r4,TWO_P23@l
+	cmpw	r3,r4
+	bge	L(greater_or_equal_2p23)
+
+	fabs	fp1,fp1			/* |x| */
+
+	/* Calculate quotient of |x|/(PI/4).  */
+	lfd	fp2,(L(invpio4)-L(anchor))(r9)
+
+	lfd     fp3,(L(DPone)-L(anchor))(r9)
+	lfd     fp4,(L(DPhalf)-L(anchor))(r9)
+	fmul    fp2,fp1,fp2             /* |x|/(PI/4) */
+	friz    fp2,fp2                 /* n = floor(|x|/(PI/4)) */
+
+	/* Calculate (n + 1) / 2.  */
+	fadd	fp2,fp2,fp3		/* n + 1 */
+	fmul	fp3,fp2,fp4		/* (n + 1) / 2 */
+	friz	fp3,fp3
+
+	lfd	fp4,(L(pio2hi)-L(anchor))(r9)
+	lfd	fp5,(L(pio2lo)-L(anchor))(r9)
+
+	fmul	fp6,fp4,fp3
+	fadd	fp6,fp6,fp1
+	fmadd	fp1,fp5,fp3,fp6
+
+	fctiduz	fp2,fp2
+	mfvsrd	r7,v2			/* n + 1 */
+
+	b	L(reduced)
+
+	.balign 16
+L(inf_or_nan):
+	bne	L(skip_errno_setting)	/* Is a NAN?  */
+
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point, we are sure that
+	   doing so is actually needed.  */
+
+	stfd	fp1,-8(r1)
+
+	/* Save the link register.  */
+	mflr	r0
+	std	r0,16(r1)
+	cfi_offset(lr, 16)
+
+	/* Create the stack frame.  */
+	stdu	r1,-FRAMESIZE(r1)
+	cfi_adjust_cfa_offset(FRAMESIZE)
+
+	bl	JUMPTARGET(__errno_location)
+	nop
+
+	/* Restore the stack frame.  */
+	addi	r1,r1,FRAMESIZE
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	/* Restore the link register.  */
+	ld	r0,16(r1)
+	mtlr	r0
+
+	lfd	fp1,-8(r1)
+
+	/* errno = EDOM */
+	li	r4,EDOM
+	stw	r4,0(r3)
+
+L(skip_errno_setting):
+	fsub	fp1,fp1,fp1		/* x - x */
+	blr
+
+	.balign 16
+L(greater_or_equal_2p23):
+	fabs	fp1,fp1
+
+	srwi	r4,r3,FLOAT_EXPONENT_SHIFT
+	subi	r4,r4,FLOAT_EXPONENT_BIAS
+
+	/* We reduce the input modulo pi/4, so we need 3 bits of integer
+	   to determine where in 2*pi we are. Index into our array
+	   accordingly.  */
+	addi r4,r4,INTEGER_BITS
+
+	/* To avoid an expensive divide, for the range we care about (0 - 127)
+	   we can transform x/28 into:
+
+	   x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+	   mulhwu returns the top 32 bits of the 64 bit result, doing the
+	   shift for us in the same instruction. The top 32 bits are undefined,
+	   so we have to mask them.  */
+
+	lis	r6,FX_FRACTION_1_28@h
+	ori	r6,r6,FX_FRACTION_1_28@l
+	mulhwu	r5,r4,r6
+	clrldi	r5,r5,32
+
+	/* Get our pointer into the invpio4_table array.  */
+	sldi	r4,r5,3
+	addi	r6,r9,(L(invpio4_table)-L(anchor))
+	add	r4,r4,r6
+
+	lfd	fp2,0(r4)
+	lfd	fp3,8(r4)
+	lfd	fp4,16(r4)
+	lfd	fp5,24(r4)
+
+	fmul	fp6,fp2,fp1
+	fmul	fp7,fp3,fp1
+	fmul	fp8,fp4,fp1
+	fmul	fp9,fp5,fp1
+
+	/* Mask off larger integer bits in highest double word that we don't
+	   care about to avoid losing precision when combining with smaller
+	   values.  */
+	fctiduz	fp10,fp6
+	mfvsrd	r7,v10
+	rldicr	r7,r7,0,(63-INTEGER_BITS)
+	mtvsrd	v10,r7
+	fcfidu	fp10,fp10		/* Integer bits.  */
+
+	fsub	fp6,fp6,fp10		/* highest -= integer bits */
+
+	/* Work out the integer component, rounded down. Use the top two
+	   limbs for this.  */
+	fadd	fp10,fp6,fp7		/* highest + higher */
+
+	fctiduz	fp10,fp10
+	mfvsrd	r7,v10
+	andi.	r0,r7,1
+	fcfidu	fp10,fp10
+
+	/* Subtract integer component from highest limb.  */
+	fsub	fp12,fp6,fp10
+
+	beq	L(even_integer)
+
+	/* Our integer component is odd, so we are in the -PI/4 to 0 primary
+	   region. We need to shift our result down by PI/4, and to do this
+	   in the mod (4/PI) space we simply subtract 1.  */
+	lfd	fp11,(L(DPone)-L(anchor))(r9)
+	fsub	fp12,fp12,fp11
+
+	/* Now add up all the limbs in order.  */
+	fadd	fp12,fp12,fp7
+	fadd	fp12,fp12,fp8
+	fadd	fp12,fp12,fp9
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+L(even_integer):
+	lfd	fp11,(L(DPone)-L(anchor))(r9)
+
+	/* Now add up all the limbs in order.  */
+	fadd	fp12,fp12,fp7
+	fadd	fp12,r12,fp8
+	fadd	fp12,r12,fp9
+
+	/* We need to check if the addition of all the limbs resulted in us
+	   overflowing 1.0.  */
+	fcmpu	0,fp12,fp11
+	bgt	L(greater_than_one)
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+L(greater_than_one):
+	/* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+	   integer, and subtract 1.0 from our result. Since that makes the
+	   integer component odd, we need to subtract another 1.0 as
+	   explained above.  */
+	addi	r7,r7,1
+
+	lfd	fp11,(L(DPtwo)-L(anchor))(r9)
+	fsub	fp12,fp12,fp11
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+	.balign 16
+L(less_2pn5):
+	lis	r4,TWO_PN27@h
+	ori	r4,r4,TWO_PN27@l
+
+	cmpw	r3,r4
+	blt	L(less_2pn27)
+
+	/* A simpler Chebyshev approximation is close enough for this range:
+	   1.0+x^2*(CC0+x^3*CC1).  */
+
+	lfd	fp10,(L(CC0)-L(anchor))(r9)
+	lfd	fp11,(L(CC1)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+	lfd     fp1,(L(DPone)-L(anchor))(r9)
+
+	fmadd   fp4,fp3,fp11,fp10       /* CC0+x^3*CC1 */
+	fmadd	fp1,fp2,fp4,fp1		/* 1.0+x^2*(CC0+x^3*CC1) */
+
+	frsp	fp1,fp1			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(less_2pn27):
+	/* Handle some special cases:
+
+	   cosf(subnormal) raises inexact
+	   cosf(min_normalized) raises inexact
+	   cosf(normalized) raises inexact.  */
+
+	lfd     fp2,(L(DPone)-L(anchor))(r9)
+
+	fabs    fp1,fp1                 /* |x| */
+	fsub	fp1,fp2,fp1		/* 1.0-|x| */
+
+	frsp	fp1,fp1
+
+	blr
+
+END (__cosf)
+
+	.section .rodata, "a"
+
+	.balign 8
+
+L(anchor):
+
+	/* Chebyshev constants for sin, range -PI/4 - PI/4.  */
+L(S0):	.8byte	0xbfc5555555551cd9
+L(S1):	.8byte	0x3f81111110c2688b
+L(S2):	.8byte	0xbf2a019f8b4bd1f9
+L(S3):	.8byte	0x3ec71d7264e6b5b4
+L(S4):	.8byte	0xbe5a947e1674b58a
+
+	/* Chebyshev constants for cos, range 2^-27 - 2^-5.  */
+L(CC0):	.8byte	0xbfdfffffff5cc6fd
+L(CC1):	.8byte	0x3fa55514b178dac5
+
+	/* Chebyshev constants for cos, range -PI/4 - PI/4.  */
+L(C0):	.8byte	0xbfdffffffffe98ae
+L(C1):	.8byte	0x3fa55555545c50c7
+L(C2):	.8byte	0xbf56c16b348b6874
+L(C3):	.8byte	0x3efa00eb9ac43cc0
+L(C4):	.8byte	0xbe923c97dd8844d7
+
+L(invpio2):
+	.8byte	0x3fe45f306dc9c883	/* 2/PI */
+
+L(invpio4):
+	.8byte	0x3ff45f306dc9c883	/* 4/PI */
+
+L(invpio4_table):
+	.8byte	0x0000000000000000
+	.8byte	0x3ff45f306c000000
+	.8byte	0x3e3c9c882a000000
+	.8byte	0x3c54fe13a8000000
+	.8byte	0x3aaf47d4d0000000
+	.8byte	0x38fbb81b6c000000
+	.8byte	0x3714acc9e0000000
+	.8byte	0x3560e4107c000000
+	.8byte	0x33bca2c756000000
+	.8byte	0x31fbd778ac000000
+	.8byte	0x300b7246e0000000
+	.8byte	0x2e5d2126e8000000
+	.8byte	0x2c97003248000000
+	.8byte	0x2ad77504e8000000
+	.8byte	0x290921cfe0000000
+	.8byte	0x274deb1cb0000000
+	.8byte	0x25829a73e0000000
+	.8byte	0x23fd1046be000000
+	.8byte	0x2224baed10000000
+	.8byte	0x20709d338e000000
+	.8byte	0x1e535a2f80000000
+	.8byte	0x1cef904e64000000
+	.8byte	0x1b0d639830000000
+	.8byte	0x1964ce7d24000000
+	.8byte	0x17b908bf16000000
+
+L(pio4):
+	.8byte	0x3fe921fb54442d18	/* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+   to avoid losing significant bits when multiplying with up to
+   (2^22)/(pi/2).  */
+L(pio2hi):
+	.8byte	0xbff921fb54400000
+
+L(pio2lo):
+	.8byte	0xbdd0b4611a626332
+
+L(pio2_table):
+	.8byte	0
+	.8byte	0x3ff921fb54442d18	/* 1 * PI/2 */
+	.8byte	0x400921fb54442d18	/* 2 * PI/2 */
+	.8byte	0x4012d97c7f3321d2	/* 3 * PI/2 */
+	.8byte	0x401921fb54442d18	/* 4 * PI/2 */
+	.8byte	0x401f6a7a2955385e	/* 5 * PI/2 */
+	.8byte	0x4022d97c7f3321d2	/* 6 * PI/2 */
+	.8byte	0x4025fdbbe9bba775	/* 7 * PI/2 */
+	.8byte	0x402921fb54442d18	/* 8 * PI/2 */
+	.8byte	0x402c463abeccb2bb	/* 9 * PI/2 */
+	.8byte	0x402f6a7a2955385e	/* 10 * PI/2 */
+
+L(small):
+	.8byte	0x3cd0000000000000	/* 2^-50 */
+
+L(ones):
+	.8byte	0x3ff0000000000000	/* +1.0 */
+	.8byte	0xbff0000000000000	/* -1.0 */
+
+L(DPhalf):
+	.8byte	0x3fe0000000000000	/* 0.5 */
+
+L(DPone):
+	.8byte	0x3ff0000000000000	/* 1.0 */
+
+L(DPtwo):
+	.8byte	0x4000000000000000	/* 2.0 */
+
+weak_alias(__cosf, cosf)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
new file mode 100644
index 0000000000..fcdcb60293
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
@@ -0,0 +1,56 @@
+/* isfinite().  PowerPC64/POWER8 version.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
+
+/* int [r3] __finite ([fp1] x)  */
+
+EALIGN (__finite, 4, 0)
+	CALL_MCOUNT 0
+	MFVSRD_R3_V1
+	lis     r9,0x8010
+	clrldi  r3,r3,1       /* r3 = r3 & 0x8000000000000000  */
+	rldicr  r9,r9,32,31   /* r9 = (r9 << 32) & 0xffffffff  */
+	add     r3,r3,r9
+	rldicl  r3,r3,1,63
+	blr
+END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#if IS_IN (libm)
+# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
+compat_symbol (libm, __finite, __finitel, GLIBC_2_0)
+compat_symbol (libm, finite, finitel, GLIBC_2_0)
+# endif
+#else
+# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
new file mode 100644
index 0000000000..32814e4525
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
@@ -0,0 +1,61 @@
+/* isinf().  PowerPC64/POWER8 version.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
+
+/* int [r3] __isinf([fp1] x)  */
+
+EALIGN (__isinf, 4, 0)
+	CALL_MCOUNT 0
+	MFVSRD_R3_V1
+	lis     r9,0x7ff0     /* r9 = 0x7ff0  */
+	rldicl  r10,r3,0,1    /* r10 = r3 & (0x8000000000000000)  */
+	sldi    r9,r9,32      /* r9 = r9 << 52  */
+	cmpd    cr7,r10,r9    /* fp1 & 0x7ff0000000000000 ?  */
+	beq     cr7,L(inf)
+	li      r3,0          /* Not inf  */
+	blr
+L(inf):
+	sradi   r3,r3,63      /* r3 = r3 >> 63  */
+	ori     r3,r3,1       /* r3 = r3 | 0x1  */
+	blr
+END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
new file mode 100644
index 0000000000..af52e502b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
@@ -0,0 +1,56 @@
+/* isnan().  PowerPC64/POWER8 version.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
+
+/* int [r3] __isnan([f1] x)  */
+
+EALIGN (__isnan, 4, 0)
+	CALL_MCOUNT 0
+	MFVSRD_R3_V1
+	lis     r9,0x7ff0
+	clrldi  r3,r3,1       /* r3 = r3 & 0x8000000000000000  */
+	rldicr  r9,r9,32,31   /* r9 = (r9 << 32) & 0xffffffff  */
+	subf    r3,r3,r9
+	rldicl  r3,r3,1,63
+	blr
+END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
new file mode 100644
index 0000000000..aa180b6901
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
@@ -0,0 +1,45 @@
+/* Round double to long int.  POWER8 PowerPC64 version.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
+
+/* long long int[r3] __llrint (double x[fp1])  */
+ENTRY (__llrint)
+	CALL_MCOUNT 0
+	fctid	fp1,fp1
+	MFVSRD_R3_V1
+	blr
+END (__llrint)
+
+strong_alias (__llrint, __lrint)
+weak_alias (__llrint, llrint)
+weak_alias (__lrint, lrint)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__llrint, __llrintl)
+weak_alias (__llrint, llrintl)
+strong_alias (__lrint, __lrintl)
+weak_alias (__lrint, lrintl)
+#endif
+#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
+compat_symbol (libm, __llrint, llrintl, GLIBC_2_1)
+compat_symbol (libm, __lrint, lrintl, GLIBC_2_1)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
new file mode 100644
index 0000000000..043fc6a089
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
@@ -0,0 +1,48 @@
+/* llround function.  POWER8 PowerPC64 version.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <endian.h>
+#include <math_ldbl_opt.h>
+
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
+
+/* long long [r3] llround (float x [fp1])  */
+
+ENTRY (__llround)
+	CALL_MCOUNT 0
+	frin	fp1,fp1	/* Round to nearest +-0.5.  */
+	fctidz	fp1,fp1	/* Convert To Integer DW round toward 0.  */
+	MFVSRD_R3_V1
+	blr
+END (__llround)
+
+strong_alias (__llround, __lround)
+weak_alias (__llround, llround)
+weak_alias (__lround, lround)
+
+#ifdef NO_LONG_DOUBLE
+weak_alias (__llround, llroundl)
+strong_alias (__llround, __llroundl)
+weak_alias (__lround, lroundl)
+strong_alias (__lround, __lroundl)
+#endif
+#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
+compat_symbol (libm, __llround, llroundl, GLIBC_2_1)
+compat_symbol (libm, __lround, lroundl, GLIBC_2_1)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
new file mode 100644
index 0000000000..fb0add3462
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
@@ -0,0 +1,519 @@
+/* Optimized sinf().  PowerPC64/POWER8 version.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define _ERRNO_H	1
+#include <bits/errno.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT	23
+#define FLOAT_EXPONENT_BIAS	127
+#define INTEGER_BITS		3
+
+#define PI_4		0x3f490fdb	/* PI/4 */
+#define NINEPI_4	0x40e231d6	/* 9 * PI/4 */
+#define TWO_PN5		0x3d000000	/* 2^-5 */
+#define TWO_PN27	0x32000000	/* 2^-27 */
+#define INFINITY	0x7f800000
+#define TWO_P23		0x4b000000	/* 2^27 */
+#define FX_FRACTION_1_28 0x9249250	/* 0x100000000 / 28 + 1 */
+
+	/* Implements the function
+
+	   float [fp1] sinf (float [fp1] x)  */
+
+	.machine power8
+EALIGN(__sinf, 4, 0)
+	addis	r9,r2,L(anchor)@toc@ha
+	addi	r9,r9,L(anchor)@toc@l
+
+	lis	r4,PI_4@h
+	ori	r4,r4,PI_4@l
+
+	xscvdpspn v0,v1
+	mfvsrd	r8,v0
+	rldicl	r3,r8,32,33		/* Remove sign bit.  */
+
+	cmpw	r3,r4
+	bge	L(greater_or_equal_pio4)
+
+	lis	r4,TWO_PN5@h
+	ori	r4,r4,TWO_PN5@l
+
+	cmpw	r3,r4
+	blt	L(less_2pn5)
+
+	/* Chebyshev polynomial of the form:
+	 * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+
+	lfd	fp9,(L(S0)-L(anchor))(r9)
+	lfd	fp10,(L(S1)-L(anchor))(r9)
+	lfd	fp11,(L(S2)-L(anchor))(r9)
+	lfd	fp12,(L(S3)-L(anchor))(r9)
+	lfd	fp13,(L(S4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+
+	fmadd	fp4,fp2,fp13,fp12	/* S3+x^2*S4 */
+	fmadd	fp4,fp2,fp4,fp11	/* S2+x^2*(S3+x^2*S4) */
+	fmadd	fp4,fp2,fp4,fp10	/* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+	fmadd	fp1,fp3,fp4,fp1		/* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+	frsp	fp1,fp1			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(greater_or_equal_pio4):
+	lis	r4,NINEPI_4@h
+	ori	r4,r4,NINEPI_4@l
+	cmpw	r3,r4
+	bge	L(greater_or_equal_9pio4)
+
+	/* Calculate quotient of |x|/(PI/4).  */
+	lfd	fp2,(L(invpio4)-L(anchor))(r9)
+	fabs	fp1,fp1			/* |x| */
+	fmul	fp2,fp1,fp2		/* |x|/(PI/4) */
+	fctiduz	fp2,fp2
+	mfvsrd	r3,v2			/* n = |x| mod PI/4 */
+
+	/* Now use that quotient to find |x| mod (PI/2).  */
+	addi	r7,r3,1
+	rldicr	r5,r7,2,60		/* ((n+1) >> 1) << 3 */
+	addi	r6,r9,(L(pio2_table)-L(anchor))
+	lfdx	fp4,r5,r6
+	fsub	fp1,fp1,fp4
+
+	.balign 16
+L(reduced):
+	/* Now we are in the range -PI/4 to PI/4.  */
+
+	/* Work out if we are in a positive or negative primary interval.  */
+	rldicl	r4,r7,62,63		/* ((n+1) >> 2) & 1 */
+
+	/* We are operating on |x|, so we need to add back the original
+	   sign.  */
+	rldicl	r8,r8,33,63		/* (x >> 31) & 1, ie the sign bit.  */
+	xor	r4,r4,r8		/* 0 if result should be positive,
+					   1 if negative.  */
+
+	/* Load a 1.0 or -1.0.  */
+	addi	r5,r9,(L(ones)-L(anchor))
+	sldi	r4,r4,3
+	lfdx	fp0,r4,r5
+
+	/* Are we in the primary interval of sin or cos?  */
+	andi.	r4,r7,0x2
+	bne	L(cos)
+
+	/* Chebyshev polynomial of the form:
+	   x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+
+	lfd	fp9,(L(S0)-L(anchor))(r9)
+	lfd	fp10,(L(S1)-L(anchor))(r9)
+	lfd	fp11,(L(S2)-L(anchor))(r9)
+	lfd	fp12,(L(S3)-L(anchor))(r9)
+	lfd	fp13,(L(S4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+
+	fmadd	fp4,fp2,fp13,fp12	/* S3+x^2*S4 */
+	fmadd	fp4,fp2,fp4,fp11	/* S2+x^2*(S3+x^2*S4) */
+	fmadd	fp4,fp2,fp4,fp10	/* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+	fmadd	fp4,fp3,fp4,fp1		/* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+	fmul	fp4,fp4,fp0		/* Add in the sign.  */
+	frsp	fp1,fp4			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(cos):
+	/* Chebyshev polynomial of the form:
+	   1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+
+	lfd	fp9,(L(C0)-L(anchor))(r9)
+	lfd	fp10,(L(C1)-L(anchor))(r9)
+	lfd	fp11,(L(C2)-L(anchor))(r9)
+	lfd	fp12,(L(C3)-L(anchor))(r9)
+	lfd	fp13,(L(C4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	lfd	fp3,(L(DPone)-L(anchor))(r9)
+
+	fmadd	fp4,fp2,fp13,fp12	/* C3+x^2*C4 */
+	fmadd	fp4,fp2,fp4,fp11	/* C2+x^2*(C3+x^2*C4) */
+	fmadd	fp4,fp2,fp4,fp10	/* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+	fmadd	fp4,fp2,fp4,fp3		/* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+	fmul	fp4,fp4,fp0		/* Add in the sign.  */
+	frsp	fp1,fp4			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(greater_or_equal_9pio4):
+	lis	r4,INFINITY@h
+	ori	r4,r4,INFINITY@l
+	cmpw	r3,r4
+	bge	L(inf_or_nan)
+
+	lis	r4,TWO_P23@h
+	ori	r4,r4,TWO_P23@l
+	cmpw	r3,r4
+	bge	L(greater_or_equal_2p23)
+
+	fabs	fp1,fp1			/* |x| */
+
+	/* Calculate quotient of |x|/(PI/4).  */
+	lfd	fp2,(L(invpio4)-L(anchor))(r9)
+
+	lfd	fp3,(L(DPone)-L(anchor))(r9)
+	lfd	fp4,(L(DPhalf)-L(anchor))(r9)
+	fmul	fp2,fp1,fp2		/* |x|/(PI/4) */
+	friz	fp2,fp2			/* n = floor(|x|/(PI/4)) */
+
+	/* Calculate (n + 1) / 2.  */
+	fadd	fp2,fp2,fp3		/* n + 1 */
+	fmul	fp3,fp2,fp4		/* (n + 1) / 2 */
+	friz	fp3,fp3
+
+	lfd	fp4,(L(pio2hi)-L(anchor))(r9)
+	lfd	fp5,(L(pio2lo)-L(anchor))(r9)
+
+	fmul	fp6,fp4,fp3
+	fadd	fp6,fp6,fp1
+	fmadd	fp1,fp5,fp3,fp6
+
+	fctiduz	fp2,fp2
+	mfvsrd	r7,v2			/* n + 1 */
+
+	b	L(reduced)
+
+	.balign 16
+L(inf_or_nan):
+	bne	L(skip_errno_setting)	/* Is a NAN?  */
+
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point, we are sure that
+	   doing so is actually needed.  */
+
+	stfd	fp1,-8(r1)
+
+	/* Save the link register.  */
+	mflr	r0
+	std	r0,16(r1)
+	cfi_offset(lr, 16)
+
+	/* Create the stack frame.  */
+	stdu	r1,-FRAMESIZE(r1)
+	cfi_adjust_cfa_offset(FRAMESIZE)
+
+	bl	JUMPTARGET(__errno_location)
+	nop
+
+	/* Restore the stack frame.  */
+	addi	r1,r1,FRAMESIZE
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	/* Restore the link register.  */
+	ld	r0,16(r1)
+	mtlr	r0
+
+	lfd	fp1,-8(r1)
+
+	/* errno = EDOM */
+	li	r4,EDOM
+	stw	r4,0(r3)
+
+L(skip_errno_setting):
+	fsub	fp1,fp1,fp1		/* x - x */
+	blr
+
+	.balign 16
+L(greater_or_equal_2p23):
+	fabs	fp1,fp1
+
+	srwi	r4,r3,FLOAT_EXPONENT_SHIFT
+	subi	r4,r4,FLOAT_EXPONENT_BIAS
+
+	/* We reduce the input modulo pi/4, so we need 3 bits of integer
+	   to determine where in 2*pi we are. Index into our array
+	   accordingly.  */
+	addi r4,r4,INTEGER_BITS
+
+	/* To avoid an expensive divide, for the range we care about (0 - 127)
+	   we can transform x/28 into:
+
+	   x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+	   mulhwu returns the top 32 bits of the 64 bit result, doing the
+	   shift for us in the same instruction. The top 32 bits are undefined,
+	   so we have to mask them.  */
+
+	lis	r6,FX_FRACTION_1_28@h
+	ori	r6,r6,FX_FRACTION_1_28@l
+	mulhwu	r5,r4,r6
+	clrldi	r5,r5,32
+
+	/* Get our pointer into the invpio4_table array.  */
+	sldi	r4,r5,3
+	addi	r6,r9,(L(invpio4_table)-L(anchor))
+	add	r4,r4,r6
+
+	lfd	fp2,0(r4)
+	lfd	fp3,8(r4)
+	lfd	fp4,16(r4)
+	lfd	fp5,24(r4)
+
+	fmul	fp6,fp2,fp1
+	fmul	fp7,fp3,fp1
+	fmul	fp8,fp4,fp1
+	fmul	fp9,fp5,fp1
+
+	/* Mask off larger integer bits in highest double word that we don't
+	   care about to avoid losing precision when combining with smaller
+	   values.  */
+	fctiduz	fp10,fp6
+	mfvsrd	r7,v10
+	rldicr	r7,r7,0,(63-INTEGER_BITS)
+	mtvsrd	v10,r7
+	fcfidu	fp10,fp10		/* Integer bits.  */
+
+	fsub	fp6,fp6,fp10		/* highest -= integer bits */
+
+	/* Work out the integer component, rounded down. Use the top two
+	   limbs for this.  */
+	fadd	fp10,fp6,fp7		/* highest + higher */
+
+	fctiduz	fp10,fp10
+	mfvsrd	r7,v10
+	andi.	r0,r7,1
+	fcfidu	fp10,fp10
+
+	/* Subtract integer component from highest limb.  */
+	fsub	fp12,fp6,fp10
+
+	beq	L(even_integer)
+
+	/* Our integer component is odd, so we are in the -PI/4 to 0 primary
+	   region. We need to shift our result down by PI/4, and to do this
+	   in the mod (4/PI) space we simply subtract 1.  */
+	lfd	fp11,(L(DPone)-L(anchor))(r9)
+	fsub	fp12,fp12,fp11
+
+	/* Now add up all the limbs in order.  */
+	fadd	fp12,fp12,fp7
+	fadd	fp12,fp12,fp8
+	fadd	fp12,fp12,fp9
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+L(even_integer):
+	lfd	fp11,(L(DPone)-L(anchor))(r9)
+
+	/* Now add up all the limbs in order.  */
+	fadd	fp12,fp12,fp7
+	fadd	fp12,r12,fp8
+	fadd	fp12,r12,fp9
+
+	/* We need to check if the addition of all the limbs resulted in us
+	   overflowing 1.0.  */
+	fcmpu	0,fp12,fp11
+	bgt	L(greater_than_one)
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+L(greater_than_one):
+	/* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+	   integer, and subtract 1.0 from our result. Since that makes the
+	   integer component odd, we need to subtract another 1.0 as
+	   explained above.  */
+	addi	r7,r7,1
+
+	lfd	fp11,(L(DPtwo)-L(anchor))(r9)
+	fsub	fp12,fp12,fp11
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+	.balign 16
+L(less_2pn5):
+	lis	r4,TWO_PN27@h
+	ori	r4,r4,TWO_PN27@l
+
+	cmpw	r3,r4
+	blt	L(less_2pn27)
+
+	/* A simpler Chebyshev approximation is close enough for this range:
+	   x+x^3*(SS0+x^2*SS1).  */
+
+	lfd	fp10,(L(SS0)-L(anchor))(r9)
+	lfd	fp11,(L(SS1)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+
+	fmadd	fp4,fp2,fp11,fp10	/* SS0+x^2*SS1 */
+	fmadd	fp1,fp3,fp4,fp1		/* x+x^3*(SS0+x^2*SS1) */
+
+	frsp	fp1,fp1			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(less_2pn27):
+	cmpwi	r3,0
+	beq	L(zero)
+
+	/* Handle some special cases:
+
+	   sinf(subnormal) raises inexact/underflow
+	   sinf(min_normalized) raises inexact/underflow
+	   sinf(normalized) raises inexact.  */
+
+	lfd	fp2,(L(small)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp2		/* x * small */
+	fsub	fp1,fp1,fp2		/* x - x * small */
+
+	frsp	fp1,fp1
+
+	blr
+
+	.balign 16
+L(zero):
+	blr
+
+END (__sinf)
+
+	.section .rodata, "a"
+
+	.balign 8
+
+L(anchor):
+
+	/* Chebyshev constants for sin, range -PI/4 - PI/4.  */
+L(S0):	.8byte	0xbfc5555555551cd9
+L(S1):	.8byte	0x3f81111110c2688b
+L(S2):	.8byte	0xbf2a019f8b4bd1f9
+L(S3):	.8byte	0x3ec71d7264e6b5b4
+L(S4):	.8byte	0xbe5a947e1674b58a
+
+	/* Chebyshev constants for sin, range 2^-27 - 2^-5.  */
+L(SS0):	.8byte	0xbfc555555543d49d
+L(SS1):	.8byte	0x3f8110f475cec8c5
+
+	/* Chebyshev constants for cos, range -PI/4 - PI/4.  */
+L(C0):	.8byte	0xbfdffffffffe98ae
+L(C1):	.8byte	0x3fa55555545c50c7
+L(C2):	.8byte	0xbf56c16b348b6874
+L(C3):	.8byte	0x3efa00eb9ac43cc0
+L(C4):	.8byte	0xbe923c97dd8844d7
+
+L(invpio2):
+	.8byte	0x3fe45f306dc9c883	/* 2/PI */
+
+L(invpio4):
+	.8byte	0x3ff45f306dc9c883	/* 4/PI */
+
+L(invpio4_table):
+	.8byte	0x0000000000000000
+	.8byte	0x3ff45f306c000000
+	.8byte	0x3e3c9c882a000000
+	.8byte	0x3c54fe13a8000000
+	.8byte	0x3aaf47d4d0000000
+	.8byte	0x38fbb81b6c000000
+	.8byte	0x3714acc9e0000000
+	.8byte	0x3560e4107c000000
+	.8byte	0x33bca2c756000000
+	.8byte	0x31fbd778ac000000
+	.8byte	0x300b7246e0000000
+	.8byte	0x2e5d2126e8000000
+	.8byte	0x2c97003248000000
+	.8byte	0x2ad77504e8000000
+	.8byte	0x290921cfe0000000
+	.8byte	0x274deb1cb0000000
+	.8byte	0x25829a73e0000000
+	.8byte	0x23fd1046be000000
+	.8byte	0x2224baed10000000
+	.8byte	0x20709d338e000000
+	.8byte	0x1e535a2f80000000
+	.8byte	0x1cef904e64000000
+	.8byte	0x1b0d639830000000
+	.8byte	0x1964ce7d24000000
+	.8byte	0x17b908bf16000000
+
+L(pio4):
+	.8byte	0x3fe921fb54442d18	/* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+   to avoid losing significant bits when multiplying with up to
+   (2^22)/(pi/2).  */
+L(pio2hi):
+	.8byte	0xbff921fb54400000
+
+L(pio2lo):
+	.8byte	0xbdd0b4611a626332
+
+L(pio2_table):
+	.8byte	0
+	.8byte	0x3ff921fb54442d18	/* 1 * PI/2 */
+	.8byte	0x400921fb54442d18	/* 2 * PI/2 */
+	.8byte	0x4012d97c7f3321d2	/* 3 * PI/2 */
+	.8byte	0x401921fb54442d18	/* 4 * PI/2 */
+	.8byte	0x401f6a7a2955385e	/* 5 * PI/2 */
+	.8byte	0x4022d97c7f3321d2	/* 6 * PI/2 */
+	.8byte	0x4025fdbbe9bba775	/* 7 * PI/2 */
+	.8byte	0x402921fb54442d18	/* 8 * PI/2 */
+	.8byte	0x402c463abeccb2bb	/* 9 * PI/2 */
+	.8byte	0x402f6a7a2955385e	/* 10 * PI/2 */
+
+L(small):
+	.8byte	0x3cd0000000000000	/* 2^-50 */
+
+L(ones):
+	.8byte	0x3ff0000000000000	/* +1.0 */
+	.8byte	0xbff0000000000000	/* -1.0 */
+
+L(DPhalf):
+	.8byte	0x3fe0000000000000	/* 0.5 */
+
+L(DPone):
+	.8byte	0x3ff0000000000000	/* 1.0 */
+
+L(DPtwo):
+	.8byte	0x4000000000000000	/* 2.0 */
+
+weak_alias(__sinf, sinf)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S
new file mode 100644
index 0000000000..46b9c0067a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S
@@ -0,0 +1,1447 @@
+/* Optimized memcmp implementation for POWER7/PowerPC64.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+		    const char *s2 [r4],
+		    size_t size [r5])  */
+
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#define MFVRD(r,v)	.long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+	.machine power7
+EALIGN (MEMCMP, 4, 0)
+	CALL_MCOUNT 3
+
+#define rRTN		r3
+#define rSTR1		r3	/* First string arg.  */
+#define rSTR2		r4	/* Second string arg.  */
+#define rN		r5	/* Max string length.  */
+#define rWORD1		r6	/* Current word in s1.  */
+#define rWORD2		r7	/* Current word in s2.  */
+#define rWORD3		r8	/* Next word in s1.  */
+#define rWORD4		r9	/* Next word in s2.  */
+#define rWORD5		r10	/* Next word in s1.  */
+#define rWORD6		r11	/* Next word in s2.  */
+
+#define rOFF8		r20	/* 8 bytes offset.  */
+#define rOFF16  	r21	/* 16 bytes offset.  */
+#define rOFF24		r22	/* 24 bytes offset.  */
+#define rOFF32		r23	/* 24 bytes offset.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rWORD7		r30	/* Next word in s1.  */
+#define rWORD8		r31	/* Next word in s2.  */
+
+#define rWORD8SAVE	(-8)
+#define rWORD7SAVE	(-16)
+#define rOFF8SAVE	(-24)
+#define rOFF16SAVE	(-32)
+#define rOFF24SAVE	(-40)
+#define rOFF32SAVE	(-48)
+#define rSHRSAVE	(-56)
+#define rSHLSAVE	(-64)
+#define rWORD8SHIFTSAVE	(-72)
+#define rWORD2SHIFTSAVE	(-80)
+#define rWORD4SHIFTSAVE	(-88)
+#define rWORD6SHIFTSAVE	(-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD	ldbrx
+#else
+# define LD	ldx
+#endif
+
+	xor	r10, rSTR2, rSTR1
+	cmpldi	cr6, rN, 0
+	cmpldi	cr1, rN, 8
+	clrldi.	r0, r10, 61
+	clrldi	r12, rSTR1, 61
+	cmpldi	cr5, r12, 0
+	beq-	cr6, L(zeroLength)
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
+	/* If less than 8 bytes or not aligned, use the unaligned
+	   byte loop.  */
+	blt	cr1, L(bytealigned)
+	bne	L(unalignedqw)
+/* At this point we know both strings have the same alignment and the
+   compare length is at least 8 bytes.  r12 contains the low order
+   3 bits of rSTR1 and cr5 contains the result of the logical compare
+   of r12 to 0.  If r12 == 0 then we are already double word
+   aligned and can perform the DW aligned loop.  */
+
+	.align	4
+L(samealignment):
+	or	r11, rSTR2, rSTR1
+	clrldi.	r11, r11, 60
+	beq	L(qw_align)
+	/* Try to align to QW else proceed to DW loop.  */
+	clrldi.	r10, r10, 60
+	bne	L(DW)
+	/* For the difference to reach QW alignment, load as DW.  */
+	clrrdi	rSTR1, rSTR1, 3
+	clrrdi	rSTR2, rSTR2, 3
+	subfic	r10, r12, 8
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
+	sldi	r9, r10, 3
+	subfic	r9, r9, 64
+	sld	rWORD1, rWORD1, r9
+	sld	rWORD2, rWORD2, r9
+	cmpld	cr6, rWORD1, rWORD2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	bne	cr6, L(ret_diff)
+	subf	rN, r10, rN
+
+	cmpld	cr6, r11, r12
+	bgt	cr6, L(qw_align)
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
+	cmpld	cr6, rWORD1, rWORD2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	bne	cr6, L(different)
+	cmpldi	cr6, rN, 8
+	ble	cr6, L(zeroLength)
+	addi	rN, rN, -8
+	/* Now both rSTR1 and rSTR2 are aligned to QW.  */
+	.align	4
+L(qw_align):
+	vspltisb	v0, 0
+	srdi.	r6, rN, 6
+	li	r8, 16
+	li	r10, 32
+	li	r11, 48
+	ble	cr0, L(lessthan64)
+	mtctr	r6
+	vspltisb	v8, 0
+	vspltisb	v6, 0
+	/* Aligned vector loop.  */
+	.align	4
+L(aligned_loop):
+	lvx	v4, 0, rSTR1
+	lvx	v5, 0, rSTR2
+	vcmpequb.	v7, v6, v8
+	bnl	cr6, L(different3)
+	lvx	v6, rSTR1, r8
+	lvx	v8, rSTR2, r8
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different2)
+	lvx	v4, rSTR1, r10
+	lvx	v5, rSTR2, r10
+	vcmpequb.	v7, v6, v8
+	bnl	cr6, L(different3)
+	lvx	v6, rSTR1, r11
+	lvx	v8, rSTR2, r11
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different2)
+	addi	rSTR1, rSTR1, 64
+	addi	rSTR2, rSTR2, 64
+	bdnz	L(aligned_loop)
+	vcmpequb.	v7, v6, v8
+	bnl	cr6, L(different3)
+	clrldi	rN, rN, 58
+	/* Handle remainder for aligned loop.  */
+	.align	4
+L(lessthan64):
+	mr	r9, rSTR1
+	cmpdi	cr6, rN, 0
+	li	rSTR1, 0
+	blelr	cr6
+	lvx	v4, 0, r9
+	lvx	v5, 0, rSTR2
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	addi	rN, rN, -16
+
+	cmpdi	cr6, rN, 0
+	blelr	cr6
+	lvx	v4, r9, r8
+	lvx	v5, rSTR2, r8
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	addi	rN, rN, -16
+
+	cmpdi	cr6, rN, 0
+	blelr	cr6
+	lvx	v4, r9, r10
+	lvx	v5, rSTR2, r10
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	addi	rN, rN, -16
+
+	cmpdi	cr6, rN, 0
+	blelr	cr6
+	lvx	v4, r9, r11
+	lvx	v5, rSTR2, r11
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	blr
+
+	/* Calculate and return the difference.  */
+	.align 4
+L(different1):
+	cmpdi	cr6, rN, 16
+	bge	cr6, L(different2)
+	/* Discard unwanted bytes.  */
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v1, 0, rN
+	vperm	v4, v4, v0, v1
+	vperm	v5, v5, v0, v1
+#else
+	lvsl	v1, 0, rN
+	vperm	v4, v0, v4, v1
+	vperm	v5, v0, v5, v1
+#endif
+	vcmpequb.	v7, v4, v5
+	li	rRTN, 0
+	bltlr	cr6
+	.align 4
+L(different2):
+#ifdef __LITTLE_ENDIAN__
+	/* Reverse bytes for direct comparison.  */
+	lvsl	v10, r0, r0
+	vspltisb	v8, 15
+	vsububm	v9, v8, v10
+	vperm	v4, v4, v0, v9
+	vperm	v5, v5, v0, v9
+#endif
+	MFVRD(r7, v4)
+	MFVRD(r9, v5)
+	cmpld	cr6, r7, r9
+	bne	cr6, L(ret_diff)
+	/* Difference in second DW.  */
+	vsldoi	v4, v4, v4, 8
+	vsldoi	v5, v5, v5, 8
+	MFVRD(r7, v4)
+	MFVRD(r9, v5)
+	cmpld	cr6, r7, r9
+L(ret_diff):
+	li	rRTN, 1
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+	.align	4
+L(different3):
+#ifdef __LITTLE_ENDIAN__
+	/* Reverse bytes for direct comparison.  */
+	vspltisb	v9, 15
+	lvsl	v10, r0, r0
+	vsububm	v9, v9, v10
+	vperm	v6, v6, v0, v9
+	vperm	v8, v8, v0, v9
+#endif
+	MFVRD(r7, v6)
+	MFVRD(r9, v8)
+	cmpld	cr6, r7, r9
+	bne	cr6, L(ret_diff)
+	/* Difference in second DW.  */
+	vsldoi	v6, v6, v6, 8
+	vsldoi	v8, v8, v8, 8
+	MFVRD(r7, v6)
+	MFVRD(r9, v8)
+	cmpld	cr6, r7, r9
+	li	rRTN, 1
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+
+	.align 4
+L(different):
+	cmpldi	cr7, rN, 8
+	bgt	cr7, L(end)
+	/* Skip unwanted bytes.  */
+	sldi	r8, rN, 3
+	subfic	r8, r8, 64
+	srd	rWORD1, rWORD1, r8
+	srd	rWORD2, rWORD2, r8
+	cmpld	cr6, rWORD1, rWORD2
+	li	rRTN, 0
+	beqlr	cr6
+L(end):
+	li	rRTN, 1
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+
+	.align	4
+L(unalignedqw):
+	/* Proceed to DW unaligned loop,if there is a chance of pagecross.  */
+	rldicl	r9, rSTR1, 0, 52
+	add	r9, r9, rN
+	cmpldi	cr0, r9, 4096-16
+	bgt	cr0, L(unaligned)
+	rldicl	r9, rSTR2, 0, 52
+	add	r9, r9, rN
+	cmpldi	cr0, r9, 4096-16
+	bgt	cr0, L(unaligned)
+	li	r0, 0
+	li	r8, 16
+	vspltisb	v0, 0
+	/* Check if rSTR1 is aligned to QW.  */
+	andi.	r11, rSTR1, 0xF
+	beq	L(s1_align)
+
+	/* Compare 16B and align S1 to QW.  */
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v10, 0, rSTR1	/* Compute mask.  */
+	lvsr	v6, 0, rSTR2	/* Compute mask.  */
+#else
+	lvsl	v10, 0, rSTR1	/* Compute mask.  */
+	lvsl	v6, 0, rSTR2	/* Compute mask.  */
+#endif
+	lvx	v5, 0, rSTR2
+	lvx	v9, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v9, v5, v6
+#else
+	vperm	v5, v5, v9, v6
+#endif
+	lvx	v4, 0, rSTR1
+	lvx	v9, rSTR1, r8
+#ifdef __LITTLE_ENDIAN__
+	vperm	v4, v9, v4, v10
+#else
+	vperm	v4, v4, v9, v10
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	cmpldi	cr6, rN, 16
+	ble	cr6, L(zeroLength)
+	subfic	r11, r11, 16
+	subf	rN, r11, rN
+	add	rSTR1, rSTR1, r11
+	add	rSTR2, rSTR2, r11
+
+	/* As s1 is QW aligned prepare for unaligned loop.  */
+	.align	4
+L(s1_align):
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v6, 0, rSTR2
+#else
+	lvsl	v6, 0, rSTR2
+#endif
+	lvx	v5, 0, rSTR2
+	srdi.	r6, rN, 6
+	li	r10, 32
+	li	r11, 48
+	ble	cr0, L(lessthan64_unalign)
+	mtctr	r6
+	li 	r9, 64
+	/* Unaligned vector loop.  */
+	.align	4
+L(unalign_qwloop):
+	lvx	v4, 0, rSTR1
+	lvx	v10, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v10, v5, v6
+#else
+	vperm	v5, v5, v10, v6
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different2)
+	vor	v5, v10, v10
+	lvx	v4, rSTR1, r8
+	lvx	v10, rSTR2, r10
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v10, v5, v6
+#else
+	vperm	v5, v5, v10, v6
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different2)
+	vor	v5, v10, v10
+	lvx	v4, rSTR1, r10
+	lvx	v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v10, v5, v6
+#else
+	vperm	v5, v5, v10, v6
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different2)
+	vor	v5, v10, v10
+	lvx	v4, rSTR1, r11
+	lvx	v10, rSTR2, r9
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v10, v5, v6
+#else
+	vperm	v5, v5, v10, v6
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different2)
+	vor	v5, v10, v10
+	addi	rSTR1, rSTR1, 64
+	addi	rSTR2, rSTR2, 64
+	bdnz	L(unalign_qwloop)
+	clrldi	rN, rN, 58
+	/* Handle remainder for unaligned loop.  */
+	.align	4
+L(lessthan64_unalign):
+	mr	r9, rSTR1
+	cmpdi	cr6, rN, 0
+	li	rSTR1, 0
+	blelr	cr6
+	lvx	v4, 0, r9
+	lvx     v10, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v10, v5, v6
+#else
+	vperm	v5, v5, v10, v6
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	vor	v5, v10, v10
+	addi	rN, rN, -16
+
+	cmpdi	cr6, rN, 0
+	blelr	cr6
+	lvx	v4, r9, r8
+	lvx	v10, rSTR2, r10
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v10, v5, v6
+#else
+	vperm	v5, v5, v10, v6
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	vor	v5, v10, v10
+	addi	rN, rN, -16
+
+	cmpdi	cr6, rN, 0
+	blelr	cr6
+	lvx	v4, r9, r10
+	lvx	v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v10, v5, v6
+#else
+	vperm	v5, v5, v10, v6
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	vor	v5, v10, v10
+	addi	rN, rN, -16
+
+	cmpdi	cr6, rN, 0
+	blelr	cr6
+	lvx	v4, r9, r11
+	addi	r11, r11, 16
+	lvx	v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v10, v5, v6
+#else
+	vperm	v5, v5, v10, v6
+#endif
+	vcmpequb.	v7, v5, v4
+	bnl	cr6, L(different1)
+	blr
+
+/* Otherwise we know the two strings have the same alignment (but not
+   yet DW).  So we force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
+   eliminate bits preceding the first byte.  Since we want to join the
+   normal (DW aligned) compare loop, starting at the second double word,
+   we need to adjust the length (rN) and special case the loop
+   versioning for the first DW.  This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair.  */
+	.align	4
+L(DW):
+	std	rWORD8, rWORD8SAVE(r1)
+	std	rWORD7, rWORD7SAVE(r1)
+	std	rOFF8, rOFF8SAVE(r1)
+	std	rOFF16, rOFF16SAVE(r1)
+	std	rOFF24, rOFF24SAVE(r1)
+	std	rOFF32, rOFF32SAVE(r1)
+	cfi_offset(rWORD8, rWORD8SAVE)
+	cfi_offset(rWORD7, rWORD7SAVE)
+	cfi_offset(rOFF8, rOFF8SAVE)
+	cfi_offset(rOFF16, rOFF16SAVE)
+	cfi_offset(rOFF24, rOFF24SAVE)
+	cfi_offset(rOFF32, rOFF32SAVE)
+
+	li	rOFF8,8
+	li	rOFF16,16
+	li	rOFF24,24
+	li	rOFF32,32
+	clrrdi	rSTR1, rSTR1, 3
+	clrrdi	rSTR2, rSTR2, 3
+	beq	cr5, L(DWaligned)
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	srdi	r0, rN, 5	/* Divide by 32.  */
+	andi.	r12, rN, 24	/* Get the DW remainder.  */
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
+	beq	L(dPs4)
+	mtctr	r0
+	bgt	cr1, L(dPs3)
+	beq	cr1, L(dPs2)
+
+/* Remainder is 8.  */
+	.align	3
+L(dsP1):
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
+	cmpld	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway.  */
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	b	L(dP1e)
+/* Remainder is 16.  */
+	.align	4
+L(dPs2):
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway.  */
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	b	L(dP2e)
+/* Remainder is 24.  */
+	.align	4
+L(dPs3):
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD2, rWORD6
+	cmpld	cr1, rWORD3, rWORD4
+	b	L(dP3e)
+/* Count is a multiple of 32, remainder is 0.  */
+	.align	4
+L(dPs4):
+	mtctr	r0
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD2, rWORD6
+	cmpld	cr7, rWORD1, rWORD2
+	b	L(dP4e)
+
+/* At this point we know both strings are double word aligned and the
+   compare length is at least 8 bytes.  */
+	.align	4
+L(DWaligned):
+	andi.	r12, rN, 24	/* Get the DW remainder.  */
+	srdi	r0, rN, 5	/* Divide by 32.  */
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
+	beq	L(dP4)
+	bgt	cr1, L(dP3)
+	beq	cr1, L(dP2)
+
+/* Remainder is 8.  */
+	.align	4
+L(dP1):
+	mtctr	r0
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+   (8-15 byte compare), we want to use only volatile registers.  This
+   means we can avoid restoring non-volatile registers since we did not
+   change any on the early exit path.  The key here is the non-early
+   exit path only cares about the condition code (cr5), not about which
+   register pair was used.  */
+	LD	rWORD5, 0, rSTR1
+	LD	rWORD6, 0, rSTR2
+	cmpld	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+L(dP1e):
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
+
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
+	bne	cr1, L(dLcr1)
+	cmpld	cr5, rWORD7, rWORD8
+	bdnz	L(dLoop)
+	bne	cr6, L(dLcr6)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	.align	3
+L(dP1x):
+	sldi.	r12, rN, 3
+	bne	cr5, L(dLcr5x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+/* Remainder is 16.  */
+	.align	4
+L(dP2):
+	mtctr	r0
+	LD	rWORD5, 0, rSTR1
+	LD	rWORD6, 0, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+L(dP2e):
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	LD	rWORD3, rOFF24, rSTR1
+	LD	rWORD4, rOFF24, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	bne	cr6, L(dLcr6)
+	bne	cr5, L(dLcr5)
+	b	L(dLoop2)
+	.align	4
+L(dP2x):
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	sldi.	r12, rN, 3
+	bne	cr6, L(dLcr6x)
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	bne	cr1, L(dLcr1x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+/* Remainder is 24.  */
+	.align	4
+L(dP3):
+	mtctr	r0
+	LD	rWORD3, 0, rSTR1
+	LD	rWORD4, 0, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+L(dP3e):
+	LD	rWORD5, rOFF8, rSTR1
+	LD	rWORD6, rOFF8, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP3x)
+	LD	rWORD7, rOFF16, rSTR1
+	LD	rWORD8, rOFF16, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	LD	rWORD1, rOFF24, rSTR1
+	LD	rWORD2, rOFF24, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+	bne	cr1, L(dLcr1)
+	bne	cr6, L(dLcr6)
+	b	L(dLoop1)
+/* Again we are on a early exit path (24-31 byte compare), we want to
+   only use volatile registers and avoid restoring non-volatile
+   registers.  */
+	.align	4
+L(dP3x):
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	sldi.	r12, rN, 3
+	bne	cr1, L(dLcr1x)
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+	bne	cr6, L(dLcr6x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	bne	cr7, L(dLcr7x)
+	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+/* Count is a multiple of 32, remainder is 0.  */
+	.align	4
+L(dP4):
+	mtctr	r0
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+L(dP4e):
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	LD	rWORD5, rOFF16, rSTR1
+	LD	rWORD6, rOFF16, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	LD	rWORD7, rOFF24, rSTR1
+	LD	rWORD8, rOFF24, rSTR2
+	addi	rSTR1, rSTR1, 24
+	addi	rSTR2, rSTR2, 24
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+	bne	cr1, L(dLcr1)
+	bdz-	L(d24)		/* Adjust CTR as we start with +4.  */
+/* This is the primary loop.  */
+	.align	4
+L(dLoop):
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+L(dLoop1):
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+L(dLoop2):
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+L(dLoop3):
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
+	bne	cr1, L(dLcr1)
+	cmpld	cr7, rWORD1, rWORD2
+	bdnz	L(dLoop)
+
+L(dL4):
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+	cmpld	cr5, rWORD7, rWORD8
+L(d44):
+	bne	cr7, L(dLcr7)
+L(d34):
+	bne	cr1, L(dLcr1)
+L(d24):
+	bne	cr6, L(dLcr6)
+L(d14):
+	sldi.	r12, rN, 3
+	bne	cr5, L(dLcr5)
+L(d04):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	beq	L(duzeroLength)
+/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
+   we are aligned it is safe to load the whole double word, and use
+   shift right double to eliminate bits beyond the compare length.  */
+L(d00):
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	srd	rWORD1, rWORD1, rN
+	srd	rWORD2, rWORD2, rN
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr7, L(dLcr7x)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+	.align	4
+L(dLcr7):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dLcr7x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
+	blr
+	.align	4
+L(dLcr1):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dLcr1x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 1
+	bgtlr	cr1
+	li	rRTN, -1
+	blr
+	.align	4
+L(dLcr6):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dLcr6x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 1
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+	.align	4
+L(dLcr5):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dLcr5x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 1
+	bgtlr	cr5
+	li	rRTN, -1
+	blr
+
+	.align	4
+L(bytealigned):
+	mtctr	rN
+
+/* We need to prime this loop.  This loop is swing modulo scheduled
+   to avoid pipe delays.  The dependent instruction latencies (load to
+   compare to conditional branch) is 2 to 3 cycles.  In this loop each
+   dispatch group ends in a branch and takes 1 cycle.  Effectively
+   the first iteration of the loop only serves to load operands and
+   branches based on compares are delayed until the next loop.
+
+   So we must precondition some registers and condition codes so that
+   we don't exit the loop early on the first iteration.  */
+
+	lbz	rWORD1, 0(rSTR1)
+	lbz	rWORD2, 0(rSTR2)
+	bdz	L(b11)
+	cmpld	cr7, rWORD1, rWORD2
+	lbz	rWORD3, 1(rSTR1)
+	lbz	rWORD4, 1(rSTR2)
+	bdz	L(b12)
+	cmpld	cr1, rWORD3, rWORD4
+	lbzu	rWORD5, 2(rSTR1)
+	lbzu	rWORD6, 2(rSTR2)
+	bdz	L(b13)
+	.align	4
+L(bLoop):
+	lbzu	rWORD1, 1(rSTR1)
+	lbzu	rWORD2, 1(rSTR2)
+	bne	cr7, L(bLcr7)
+
+	cmpld	cr6, rWORD5, rWORD6
+	bdz	L(b3i)
+
+	lbzu	rWORD3, 1(rSTR1)
+	lbzu	rWORD4, 1(rSTR2)
+	bne	cr1, L(bLcr1)
+
+	cmpld	cr7, rWORD1, rWORD2
+	bdz	L(b2i)
+
+	lbzu	rWORD5, 1(rSTR1)
+	lbzu	rWORD6, 1(rSTR2)
+	bne	cr6, L(bLcr6)
+
+	cmpld	cr1, rWORD3, rWORD4
+	bdnz	L(bLoop)
+
+/* We speculatively loading bytes before we have tested the previous
+   bytes.  But we must avoid overrunning the length (in the ctr) to
+   prevent these speculative loads from causing a segfault.  In this
+   case the loop will exit early (before the all pending bytes are
+   tested.  In this case we must complete the pending operations
+   before returning.  */
+L(b1i):
+	bne	cr7, L(bLcr7)
+	bne	cr1, L(bLcr1)
+	b	L(bx56)
+	.align	4
+L(b2i):
+	bne	cr6, L(bLcr6)
+	bne	cr7, L(bLcr7)
+	b	L(bx34)
+	.align	4
+L(b3i):
+	bne	cr1, L(bLcr1)
+	bne	cr6, L(bLcr6)
+	b	L(bx12)
+	.align	4
+L(bLcr7):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
+	blr
+L(bLcr1):
+	li	rRTN, 1
+	bgtlr	cr1
+	li	rRTN, -1
+	blr
+L(bLcr6):
+	li	rRTN, 1
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+
+L(b13):
+	bne	cr7, L(bx12)
+	bne	cr1, L(bx34)
+L(bx56):
+	sub	rRTN, rWORD5, rWORD6
+	blr
+	nop
+L(b12):
+	bne	cr7, L(bx12)
+L(bx34):
+	sub	rRTN, rWORD3, rWORD4
+	blr
+L(b11):
+L(bx12):
+	sub	rRTN, rWORD1, rWORD2
+	blr
+
+	.align	4
+L(zeroLength):
+	li	rRTN, 0
+	blr
+
+	.align	4
+/* At this point we know the strings have different alignment and the
+   compare length is at least 8 bytes.  r12 contains the low order
+   3 bits of rSTR1 and cr5 contains the result of the logical compare
+   of r12 to 0.  If r12 == 0 then rStr1 is double word
+   aligned and can perform the DWunaligned loop.
+
+   Otherwise we know that rSTR1 is not already DW aligned yet.
+   So we can force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
+   eliminate bits preceding the first byte.  Since we want to join the
+   normal (DWaligned) compare loop, starting at the second double word,
+   we need to adjust the length (rN) and special case the loop
+   versioning for the first DW.  This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected resister pair.  */
+L(unaligned):
+	std	rWORD8, rWORD8SAVE(r1)
+	std	rWORD7, rWORD7SAVE(r1)
+	std	rOFF8, rOFF8SAVE(r1)
+	std	rOFF16, rOFF16SAVE(r1)
+	std	rOFF24, rOFF24SAVE(r1)
+	std	rOFF32, rOFF32SAVE(r1)
+	cfi_offset(rWORD8, rWORD8SAVE)
+	cfi_offset(rWORD7, rWORD7SAVE)
+	cfi_offset(rOFF8, rOFF8SAVE)
+	cfi_offset(rOFF16, rOFF16SAVE)
+	cfi_offset(rOFF24, rOFF24SAVE)
+	cfi_offset(rOFF32, rOFF32SAVE)
+	li	rOFF8,8
+	li	rOFF16,16
+	li	rOFF24,24
+	li	rOFF32,32
+	std	rSHL, rSHLSAVE(r1)
+	cfi_offset(rSHL, rSHLSAVE)
+	clrldi	rSHL, rSTR2, 61
+	beq	cr6, L(duzeroLength)
+	std	rSHR, rSHRSAVE(r1)
+	cfi_offset(rSHR, rSHRSAVE)
+	beq	cr5, L(DWunaligned)
+	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+   in the 1st rSTR1 DW.  */
+	sub	rWORD8_SHIFT, rSTR2, r12
+/* But do not attempt to address the DW before that DW that contains
+   the actual start of rSTR2.  */
+	clrrdi	rSTR2, rSTR2, 3
+	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+/* Compute the left/right shift counts for the unaligned rSTR2,
+   compensating for the logical (DW aligned) start of rSTR1.  */
+	clrldi	rSHL, rWORD8_SHIFT, 61
+	clrrdi	rSTR1, rSTR1, 3
+	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	sldi	rSHL, rSHL, 3
+	cmpld	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+	subfic	rSHR, rSHL, 64
+	srdi	r0, rN, 5	/* Divide by 32.  */
+	andi.	r12, rN, 24	/* Get the DW remainder.  */
+/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+   this special case those bits may be discarded anyway.  Also we
+   must avoid loading a DW where none of the bits are part of rSTR2 as
+   this may cross a page boundary and cause a page fault.  */
+	li	rWORD8, 0
+	blt	cr5, L(dus0)
+	LD	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+	sld	rWORD8, rWORD8, rSHL
+
+L(dus0):
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	srd	r12, rWORD2, rSHR
+	clrldi	rN, rN, 61
+	beq	L(duPs4)
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	bgt	cr1, L(duPs3)
+	beq	cr1, L(duPs2)
+
+/* Remainder is 8.  */
+	.align	4
+L(dusP1):
+	sld	rWORD8_SHIFT, rWORD2, rSHL
+	sld	rWORD7, rWORD1, rWORD6
+	sld	rWORD8, rWORD8, rWORD6
+	bge	cr7, L(duP1e)
+/* At this point we exit early with the first double word compare
+   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
+   how we handle the remaining bytes.  */
+	cmpld	cr5, rWORD7, rWORD8
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	b	L(dutrim)
+/* Remainder is 16.  */
+	.align	4
+L(duPs2):
+	sld	rWORD6_SHIFT, rWORD2, rSHL
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD8, rWORD6
+	b	L(duP2e)
+/* Remainder is 24.  */
+	.align	4
+L(duPs3):
+	sld	rWORD4_SHIFT, rWORD2, rSHL
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD8, rWORD6
+	b	L(duP3e)
+/* Count is a multiple of 32, remainder is 0.  */
+	.align	4
+L(duPs4):
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD8, rWORD6
+	b	L(duP4e)
+
+/* At this point we know rSTR1 is double word aligned and the
+   compare length is at least 8 bytes.  */
+	.align	4
+L(DWunaligned):
+	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	clrrdi	rSTR2, rSTR2, 3
+	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	srdi	r0, rN, 5	/* Divide by 32.  */
+	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	andi.	r12, rN, 24	/* Get the DW remainder.  */
+	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+	sldi	rSHL, rSHL, 3
+	LD	rWORD6, 0, rSTR2
+	LD	rWORD8, rOFF8, rSTR2
+	addi	rSTR2, rSTR2, 8
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
+	subfic	rSHR, rSHL, 64
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	beq	L(duP4)
+	mtctr	r0
+	bgt	cr1, L(duP3)
+	beq	cr1, L(duP2)
+
+/* Remainder is 8.  */
+	.align	4
+L(duP1):
+	srd	r12, rWORD8, rSHR
+	LD	rWORD7, 0, rSTR1
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP1x)
+L(duP1e):
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	bne	cr5, L(duLcr5)
+	or	rWORD4, r12, rWORD2_SHIFT
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
+	cmpld	cr6, rWORD5, rWORD6
+	b	L(duLoop3)
+	.align	4
+/* At this point we exit early with the first double word compare
+   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
+   how we handle the remaining bytes.  */
+L(duP1x):
+	cmpld	cr5, rWORD7, rWORD8
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	b	L(dutrim)
+/* Remainder is 16.  */
+	.align	4
+L(duP2):
+	srd	r0, rWORD8, rSHR
+	LD	rWORD5, 0, rSTR1
+	or	rWORD6, r0, rWORD6_SHIFT
+	sld	rWORD6_SHIFT, rWORD8, rSHL
+L(duP2e):
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP2x)
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+	LD	rWORD3, rOFF24, rSTR1
+	LD	rWORD4, rOFF24, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr5, L(duLcr5)
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	cmpld	cr1, rWORD3, rWORD4
+	b	L(duLoop2)
+	.align	4
+L(duP2x):
+	cmpld	cr5, rWORD7, rWORD8
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	bne	cr6, L(duLcr6)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	b	L(dutrim)
+
+/* Remainder is 24.  */
+	.align	4
+L(duP3):
+	srd	r12, rWORD8, rSHR
+	LD	rWORD3, 0, rSTR1
+	sld	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
+L(duP3e):
+	LD	rWORD5, rOFF8, rSTR1
+	LD	rWORD6, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+	LD	rWORD7, rOFF16, rSTR1
+	LD	rWORD8, rOFF16, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP3x)
+	LD	rWORD1, rOFF24, rSTR1
+	LD	rWORD2, rOFF24, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+	cmpld	cr7, rWORD1, rWORD2
+	b	L(duLoop1)
+	.align	4
+L(duP3x):
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	b	L(dutrim)
+
+/* Count is a multiple of 32, remainder is 0.  */
+	.align	4
+L(duP4):
+	mtctr	r0
+	srd	r0, rWORD8, rSHR
+	LD	rWORD1, 0, rSTR1
+	sld	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
+L(duP4e):
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+	LD	rWORD5, rOFF16, rSTR1
+	LD	rWORD6, rOFF16, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+	LD	rWORD7, rOFF24, rSTR1
+	LD	rWORD8, rOFF24, rSTR2
+	addi	rSTR1, rSTR1, 24
+	addi	rSTR2, rSTR2, 24
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	cmpld	cr5, rWORD7, rWORD8
+	bdz	L(du24)		/* Adjust CTR as we start with +4.  */
+/* This is the primary loop.  */
+	.align	4
+L(duLoop):
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+L(duLoop1):
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+L(duLoop2):
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+L(duLoop3):
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	bdnz	L(duLoop)
+
+L(duL4):
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	cmpld	cr5, rWORD7, rWORD8
+L(du44):
+	bne	cr7, L(duLcr7)
+L(du34):
+	bne	cr1, L(duLcr1)
+L(du24):
+	bne	cr6, L(duLcr6)
+L(du14):
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
+   shift right double to eliminate bits beyond the compare length.
+
+   However it may not be safe to load rWORD2 which may be beyond the
+   string length.  So we compare the bit length of the remainder to
+   the right shift count (rSHR).  If the bit count is less than or equal
+   we do not need to load rWORD2 (all significant bits are already in
+   rWORD8_SHIFT).  */
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	.align	4
+L(dutrim):
+	LD	rWORD1, rOFF8, rSTR1
+	ld	rWORD8, -8(r1)
+	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
+	or	rWORD2, r0, rWORD8_SHIFT
+	ld	rWORD7, rWORD7SAVE(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	srd	rWORD1, rWORD1, rN
+	srd	rWORD2, rWORD2, rN
+	ld	rSHR, rSHRSAVE(r1)
+	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	li	rRTN, 0
+	cmpld	cr7, rWORD1, rWORD2
+	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	beq	cr7, L(dureturn24)
+	li	rRTN, 1
+	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	bgtlr	cr7
+	li	rRTN, -1
+	blr
+	.align	4
+L(duLcr7):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	li	rRTN, 1
+	bgt	cr7, L(dureturn29)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr1):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	li	rRTN, 1
+	bgt	cr1, L(dureturn29)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr6):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	li	rRTN, 1
+	bgt	cr6, L(dureturn29)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr5):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	li	rRTN, 1
+	bgt	cr5, L(dureturn29)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+
+	.align	3
+L(duZeroReturn):
+	li	rRTN, 0
+	.align	4
+L(dureturn):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dureturn29):
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+L(dureturn27):
+	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+L(dureturn24):
+	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	blr
+
+L(duzeroLength):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+END (MEMCMP)
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp, bcmp)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000000..bc734c9f4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,458 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define MTVSRD_V1_R4  .long 0x7c240166     /* mtvsrd  v1,r4  */
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+   Returns 's'.  */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
+	/* No need to use .machine power8 since mtvsrd is already
+	   handled by the define.  It avoid breakage on binutils
+	   that does not support this machine specifier.  */
+	.machine power7
+EALIGN (MEMSET, 5, 0)
+	CALL_MCOUNT 3
+
+L(_memset):
+	cmpldi	cr7,r5,31
+	neg	r0,r3
+	mr	r10,r3
+
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
+	ble	cr7,L(write_LT_32)
+
+	andi.	r11,r10,15	/* Check alignment of DST.  */
+	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
+
+	beq	L(big_aligned)
+
+	mtocrf	0x01,r0
+	clrldi	r0,r0,60
+
+	/* Get DST aligned to 16 bytes.  */
+1:	bf	31,2f
+	stb	r4,0(r10)
+	addi	r10,r10,1
+
+2:	bf	30,4f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+4:	bf	29,8f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+8:	bf      28,16f
+	std     r4,0(r10)
+	addi    r10,r10,8
+
+16:	subf	r5,r0,r5
+
+	.align	4
+L(big_aligned):
+	/* For sizes larger than 255 two possible paths:
+	   - if constant is '0', zero full cache lines with dcbz
+	   - otherwise uses vector instructions.  */
+	cmpldi	cr5,r5,255
+	dcbtst	0,r10
+	cmpldi	cr6,r4,0
+	crand	27,26,21
+	bt	27,L(huge_dcbz)
+	bge	cr5,L(huge_vector)
+
+
+	/* Size between 32 and 255 bytes with constant different than 0, use
+	   doubleword store instruction to achieve best throughput.  */
+	srdi    r8,r5,5
+	clrldi  r11,r5,59
+	cmpldi  cr6,r11,0
+	cmpdi	r8,0
+	beq     L(tail_bytes)
+	mtctr   r8
+
+	/* Main aligned write loop, writes 32-bytes at a time.  */
+	.align  4
+L(big_loop):
+	std     r4,0(r10)
+	std     r4,8(r10)
+	std     r4,16(r10)
+	std     r4,24(r10)
+	addi    r10,r10,32
+	bdz     L(tail_bytes)
+
+	std     r4,0(r10)
+	std     r4,8(r10)
+	std     r4,16(r10)
+	std     r4,24(r10)
+	addi    r10,10,32
+	bdnz    L(big_loop)
+
+	b       L(tail_bytes)
+
+	/* Write remaining 1~31 bytes.  */
+	.align  4
+L(tail_bytes):
+	beqlr   cr6
+
+	srdi    r7,r11,4
+	clrldi  r8,r11,60
+	mtocrf  0x01,r7
+
+	.align	4
+	bf	31,8f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+
+	.align	4
+8:	mtocrf	0x1,r8
+	bf	28,4f
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+	.align	4
+4:	bf      29,2f
+	stw     4,0(10)
+	addi    10,10,4
+
+	.align 	4
+2:	bf      30,1f
+	sth     4,0(10)
+	addi    10,10,2
+
+	.align  4
+1:      bflr    31
+	stb     4,0(10)
+	blr
+
+	/* Size larger than 255 bytes with constant different than 0, use
+	   vector instruction to achieve best throughput.  */
+L(huge_vector):
+	/* Replicate set byte to quadword in VMX register.  */
+	MTVSRD_V1_R4
+	xxpermdi 32,v0,v1,0
+	vspltb	 v2,v0,15
+
+	/* Main aligned write loop: 128 bytes at a time.  */
+	li	r6,16
+	li	r7,32
+	li	r8,48
+	mtocrf	0x02,r5
+	srdi	r12,r5,7
+	cmpdi	r12,0
+	beq	L(aligned_tail)
+	mtctr	r12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128loop):
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+	bdnz	L(aligned_128loop)
+
+	/* Write remaining 1~127 bytes.  */
+L(aligned_tail):
+	mtocrf	0x01,r5
+	bf	25,32f
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+
+32:	bf	26,16f
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	addi	r10,r10,32
+
+16:	bf	27,8f
+	stvx	v2,0,r10
+	addi	r10,r10,16
+
+8:	bf	28,4f
+	std     r4,0(r10)
+	addi	r10,r10,8
+
+	/* Copies 4~7 bytes.  */
+4:	bf	29,L(tail2)
+	stw     r4,0(r10)
+	bf      30,L(tail5)
+	sth     r4,4(r10)
+	bflr	31
+	stb     r4,6(r10)
+	/* Return original DST pointer.  */
+	blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
+	   Before using dcbz though, we need to get the destination 128-byte
+	   aligned.  */
+	.align	4
+L(huge_dcbz):
+	andi.	r11,r10,127
+	neg	r0,r10
+	beq	L(huge_dcbz_aligned)
+
+	clrldi	r0,r0,57
+	subf	r5,r0,r5
+	srdi	r0,r0,3
+	mtocrf	0x01,r0
+
+	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	std	r4,32(r10)
+	std	r4,40(r10)
+	std	r4,48(r10)
+	std	r4,56(r10)
+	addi	r10,r10,64
+
+	.align	4
+4:	bf	29,2f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	addi	r10,r10,32
+
+	.align	4
+2:	bf	30,1f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+
+	.align	4
+1:	bf	31,L(huge_dcbz_aligned)
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+L(huge_dcbz_aligned):
+	/* Setup dcbz unroll offsets and count numbers.  */
+	srdi	r8,r5,9
+	clrldi	r11,r5,55
+	cmpldi	cr6,r11,0
+	li	r9,128
+	cmpdi	r8,0
+	beq     L(huge_tail)
+	li	r7,256
+	li	r6,384
+	mtctr	r8
+
+	.align	4
+L(huge_loop):
+	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+	   a throughput boost for large sizes (2048 bytes or higher).  */
+	dcbz	0,r10
+	dcbz	r9,r10
+	dcbz	r7,r10
+	dcbz	r6,r10
+	addi	r10,r10,512
+	bdnz	L(huge_loop)
+
+	beqlr	cr6
+
+L(huge_tail):
+	srdi    r6,r11,8
+	srdi    r7,r11,4
+	clrldi  r8,r11,4
+	cmpldi  cr6,r8,0
+	mtocrf  0x01,r6
+
+	beq	cr6,L(tail)
+
+	/* We have 1~511 bytes remaining.  */
+	.align	4
+32:	bf	31,16f
+	dcbz	0,r10
+	dcbz	r9,r10
+	addi	r10,r10,256
+
+	.align	4
+16:	mtocrf  0x01,r7
+	bf	28,8f
+	dcbz	0,r10
+	addi	r10,r10,128
+
+	.align 	4
+8:	bf	29,4f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	std	r4,32(r10)
+	std	r4,40(r10)
+	std	r4,48(r10)
+	std	r4,56(r10)
+	addi	r10,r10,64
+
+	.align	4
+4:	bf	30,2f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	addi	r10,r10,32
+
+	.align	4
+2:	bf	31,L(tail)
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+	.align	4
+
+	/* Remaining 1~15 bytes.  */
+L(tail):
+	mtocrf  0x01,r8
+
+	.align
+8:	bf	28,4f
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+	.align	4
+4:	bf	29,2f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+	.align	4
+2:	bf	30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+	.align	4
+1:	bflr	31
+	stb	r4,0(r10)
+	blr
+
+	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
+	   by just unrolling all operations.  */
+	.align	4
+L(write_LT_32):
+	cmpldi	cr6,5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(write_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	r8,r4
+	andi.	r0,r8,3
+	cmpldi	cr1,r5,16
+	beq	L(write_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,r0
+	subf	r5,r0,r5
+
+2:	bf	30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+1:	bf	31,L(end_4bytes_alignment)
+	stb	r4,0(r10)
+	addi	r10,r10,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(write_LT_32_aligned):
+	blt	cr1,8f
+
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	stw	r4,8(r10)
+	stw	r4,12(r10)
+	addi	r10,r10,16
+
+8:	bf	28,L(tail4)
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	addi	r10,r10,8
+
+	.align	4
+	/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	stw	r4,0(r10)
+	bf	30,L(tail5)
+	sth	r4,4(r10)
+	bflr	31
+	stb	r4,6(r10)
+	blr
+
+	.align	4
+	/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	sth	r4,0(r10)
+	bflr	31
+	stb	r4,2(r10)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	stb	r4,4(r10)
+	blr
+
+	.align	4
+1: 	bflr	31
+	stb	r4,0(r10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(write_LE_8):
+	bne	cr6,L(tail4)
+
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	blr
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+   between bzero and memset.  */
+ENTRY (__bzero)
+	CALL_MCOUNT 3
+	mr	r5,r4
+	li	r4,0
+	b	L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
new file mode 100644
index 0000000000..1fc7b7cd39
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power7/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S
new file mode 100644
index 0000000000..955e738cee
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpcpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S
new file mode 100644
index 0000000000..c14d984dd0
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
new file mode 100644
index 0000000000..88b17a6eb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
@@ -0,0 +1,457 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
+
+#ifndef USE_AS_STRNCASECMP
+#  define __STRCASECMP __strcasecmp
+#  define STRCASECMP   strcasecmp
+#else
+#  define __STRCASECMP __strncasecmp
+#  define STRCASECMP   strncasecmp
+#endif
+/* Convert 16 bytes to lowercase and compare */
+#define TOLOWER()     \
+	vaddubm	v8, v4, v1; \
+	vaddubm	v7, v4, v3; \
+	vcmpgtub	v8, v8, v2; \
+	vsel	v4, v7, v4, v8; \
+	vaddubm	v8, v5, v1; \
+	vaddubm	v7, v5, v3; \
+	vcmpgtub	v8, v8, v2; \
+	vsel	v5, v7, v5, v8; \
+	vcmpequb.	v7, v5, v4;
+
+/*
+ * Get 16 bytes for unaligned case.
+ * reg1: Vector to hold next 16 bytes.
+ * reg2: Address to read from.
+ * reg3: Permute control vector.
+ * v8: Tmp vector used to mask unwanted bytes.
+ * v9: Tmp vector,0 when null is found on first 16 bytes
+ */
+#ifdef __LITTLE_ENDIAN__
+#define GET16BYTES(reg1, reg2, reg3) \
+	lvx	reg1, 0, reg2; \
+	vspltisb	v8, -1; \
+	vperm	v8, v8, reg1, reg3; \
+	vcmpequb.	v8, v0, v8; \
+	beq	cr6, 1f; \
+	vspltisb	v9, 0; \
+	b	2f; \
+	.align 4; \
+1: \
+	addi	r6, reg2, 16; \
+	lvx	v9, 0, r6; \
+2: \
+	vperm	reg1, v9, reg1, reg3;
+#else
+#define GET16BYTES(reg1, reg2, reg3) \
+	lvx	reg1, 0, reg2; \
+	vspltisb	 v8, -1; \
+	vperm	v8, reg1, v8,  reg3; \
+	vcmpequb.	v8, v0, v8; \
+	beq	cr6, 1f; \
+	vspltisb	v9, 0; \
+	b	2f; \
+	.align 4; \
+1: \
+	addi	r6, reg2, 16; \
+	lvx	v9, 0, r6; \
+2: \
+	vperm	reg1, reg1, v9, reg3;
+#endif
+
+/* Check null in v4, v5 and convert to lower.  */
+#define CHECKNULLANDCONVERT() \
+	vcmpequb.	v7, v0, v5; \
+	beq	cr6, 3f; \
+	vcmpequb.	v7, v0, v4; \
+	beq	cr6, 3f; \
+	b	L(null_found); \
+	.align  4; \
+3: \
+	TOLOWER()
+
+#ifdef _ARCH_PWR8
+#  define VCLZD_V8_v7	vclzd	v8, v7;
+#  define MFVRD_R3_V1	mfvrd	r3, v1;
+#  define VSUBUDM_V9_V8	vsubudm	v9, v9, v8;
+#  define VPOPCNTD_V8_V8	vpopcntd v8, v8;
+#  define VADDUQM_V7_V8	vadduqm	v9, v7, v8;
+#else
+#  define VCLZD_V8_v7	.long	0x11003fc2
+#  define MFVRD_R3_V1	.long	0x7c230067
+#  define VSUBUDM_V9_V8	.long	0x112944c0
+#  define VPOPCNTD_V8_V8	.long	0x110047c3
+#  define VADDUQM_V7_V8	.long	0x11274100
+#endif
+
+	.machine  power7
+
+ENTRY (__STRCASECMP)
+#ifdef USE_AS_STRNCASECMP
+	CALL_MCOUNT 3
+#else
+	CALL_MCOUNT 2
+#endif
+#define rRTN	r3	/* Return value */
+#define rSTR1	r10	/* 1st string */
+#define rSTR2	r4	/* 2nd string */
+#define rCHAR1	r6	/* Byte read from 1st string */
+#define rCHAR2	r7	/* Byte read from 2nd string */
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
+#define rTMP	r9
+#define rLOC	r11	/* Default locale address */
+
+	cmpd	cr7, rRTN, rSTR2
+
+	/* Get locale address.  */
+	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
+	ld	rLOC, 0(rLOC)
+
+	mr	rSTR1, rRTN
+	li	rRTN, 0
+	beqlr	cr7
+#ifdef USE_AS_STRNCASECMP
+	cmpdi	cr7, r5, 0
+	beq	cr7, L(retnull)
+	cmpdi	cr7, r5, 16
+	blt	cr7, L(bytebybyte)
+#endif
+	vspltisb	v0, 0
+	vspltisb	v8, -1
+	/* Check for null in initial characters.
+	   Check max of 16 char depending on the alignment.
+	   If null is present, proceed byte by byte.  */
+	lvx	v4, 0, rSTR1
+#ifdef  __LITTLE_ENDIAN__
+	lvsr	v10, 0, rSTR1	/* Compute mask.  */
+	vperm	v9, v8, v4, v10	/* Mask bits that are not part of string.  */
+#else
+	lvsl	v10, 0, rSTR1
+	vperm	v9, v4, v8, v10
+#endif
+	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
+	bne	cr6, L(bytebybyte)
+	lvx	v5, 0, rSTR2
+	/* Calculate alignment.  */
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v6, 0, rSTR2
+	vperm	v9, v8, v5, v6	/* Mask bits that are not part of string.  */
+#else
+	lvsl	v6, 0, rSTR2
+	vperm	v9, v5, v8, v6
+#endif
+	vcmpequb.	v9, v0, v9	/* Check for null bytes.  */
+	bne	cr6, L(bytebybyte)
+	/* Check if locale has non ascii characters.  */
+	ld	rTMP, 0(rLOC)
+	addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+	lwz	rTMP, 0(r6)
+	cmpdi	cr7, rTMP, 1
+	beq	cr7, L(bytebybyte)
+
+	/* Load vector registers with values used for TOLOWER.  */
+	/* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte.  */
+	vspltisb	v3, 2
+	vspltisb	v9, 4
+	vsl	v3, v3, v9
+	vaddubm	v1, v3, v3
+	vnor	v1, v1, v1
+	vspltisb	v2, 7
+	vsububm	v2, v3, v2
+
+	andi.	rADDR1, rSTR1, 0xF
+	beq	cr0, L(align)
+	addi	r6, rSTR1, 16
+	lvx	v9, 0, r6
+	/* Compute 16 bytes from previous two loads.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm	v4, v9, v4, v10
+#else
+	vperm	v4, v4, v9, v10
+#endif
+L(align):
+	andi.	rADDR2, rSTR2, 0xF
+	beq	cr0, L(align1)
+	addi	r6, rSTR2, 16
+	lvx	v9, 0, r6
+	/* Compute 16 bytes from previous two loads.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v9, v5, v6
+#else
+	vperm	v5, v5, v9, v6
+#endif
+L(align1):
+	CHECKNULLANDCONVERT()
+	blt	cr6, L(match)
+	b	L(different)
+	.align 	4
+L(match):
+	clrldi	r6, rSTR1, 60
+	subfic	r7, r6, 16
+#ifdef USE_AS_STRNCASECMP
+	sub	r5, r5, r7
+#endif
+	add	rSTR1, rSTR1, r7
+	add	rSTR2, rSTR2, r7
+	andi.	rADDR2, rSTR2, 0xF
+	addi	rSTR1, rSTR1, -16
+	addi	rSTR2, rSTR2, -16
+	beq	cr0, L(aligned)
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v6, 0, rSTR2
+#else
+	lvsl	v6, 0, rSTR2
+#endif
+	/* There are 2 loops depending on the input alignment.
+	   Each loop gets 16 bytes from s1 and s2, check for null,
+	   convert to lowercase and compare. Loop till difference
+	   or null occurs. */
+L(s1_align):
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+	cmpdi	cr7, r5, 16
+	blt	cr7, L(bytebybyte)
+	addi	r5, r5, -16
+#endif
+	lvx	v4, 0, rSTR1
+	GET16BYTES(v5, rSTR2, v6)
+	CHECKNULLANDCONVERT()
+	blt	cr6, L(s1_align)
+	b	L(different)
+	.align 	4
+L(aligned):
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+	cmpdi	cr7, r5, 16
+	blt	cr7, L(bytebybyte)
+	addi	r5, r5, -16
+#endif
+	lvx	v4, 0, rSTR1
+	lvx	v5, 0, rSTR2
+	CHECKNULLANDCONVERT()
+	blt	cr6, L(aligned)
+
+	/* Calculate and return the difference. */
+L(different):
+	vaddubm	v1, v3, v3
+	vcmpequb	v7, v0, v7
+#ifdef __LITTLE_ENDIAN__
+	/* Count trailing zero.  */
+	vspltisb	v8, -1
+	VADDUQM_V7_V8
+	vandc	v8, v9, v7
+	VPOPCNTD_V8_V8
+	vspltb	v6, v8, 15
+	vcmpequb.	v6, v6, v1
+	blt	cr6, L(shift8)
+#else
+	/* Count leading zero.  */
+	VCLZD_V8_v7
+	vspltb	v6, v8, 7
+	vcmpequb.	v6, v6, v1
+	blt	cr6, L(shift8)
+	vsro	v8, v8, v1
+#endif
+	b	L(skipsum)
+	.align  4
+L(shift8):
+	vsumsws		v8, v8, v0
+L(skipsum):
+#ifdef __LITTLE_ENDIAN__
+	/* Shift registers based on leading zero count.  */
+	vsro	v6, v5, v8
+	vsro	v7, v4, v8
+	/* Merge and move to GPR.  */
+	vmrglb	v6, v6, v7
+	vslo	v1, v6, v1
+	MFVRD_R3_V1
+	/* Place the characters that are different in first position.  */
+	sldi	rSTR2, rRTN, 56
+	srdi	rSTR2, rSTR2, 56
+	sldi	rSTR1, rRTN, 48
+	srdi	rSTR1, rSTR1, 56
+#else
+	vslo	v6, v5, v8
+	vslo	v7, v4, v8
+	vmrghb	v1, v6, v7
+	MFVRD_R3_V1
+	srdi	rSTR2, rRTN, 48
+	sldi	rSTR2, rSTR2, 56
+	srdi	rSTR2, rSTR2, 56
+	srdi	rSTR1, rRTN, 56
+#endif
+	subf  	rRTN, rSTR1, rSTR2
+	extsw 	rRTN, rRTN
+	blr
+
+	.align  4
+	/* OK. We've hit the end of the string. We need to be careful that
+	   we don't compare two strings as different because of junk beyond
+	   the end of the strings...  */
+L(null_found):
+	vaddubm	v10, v3, v3
+#ifdef __LITTLE_ENDIAN__
+	/* Count trailing zero.  */
+	vspltisb	v8, -1
+	VADDUQM_V7_V8
+	vandc	v8, v9, v7
+	VPOPCNTD_V8_V8
+	vspltb	v6, v8, 15
+	vcmpequb.	v6, v6, v10
+	blt	cr6, L(shift_8)
+#else
+	/* Count leading zero.  */
+	VCLZD_V8_v7
+	vspltb	v6, v8, 7
+	vcmpequb.	v6, v6, v10
+	blt	cr6, L(shift_8)
+	vsro	v8, v8, v10
+#endif
+	b	L(skipsum1)
+	.align  4
+L(shift_8):
+	vsumsws	v8, v8, v0
+L(skipsum1):
+	/* Calculate shift count based on count of zero.  */
+	vspltisb	v10, 7
+	vslb	v10, v10, v10
+	vsldoi	v9, v0, v10, 1
+	VSUBUDM_V9_V8
+	vspltisb	v8, 8
+	vsldoi	v8, v0, v8, 1
+	VSUBUDM_V9_V8
+	/* Shift and remove junk after null character.  */
+#ifdef __LITTLE_ENDIAN__
+	vslo	v5, v5, v9
+	vslo	v4, v4, v9
+#else
+	vsro	v5, v5, v9
+	vsro	v4, v4, v9
+#endif
+	/* Convert and compare 16 bytes.  */
+	TOLOWER()
+	blt	cr6, L(retnull)
+	b	L(different)
+	.align  4
+L(retnull):
+	li	rRTN, 0
+	blr
+	.align  4
+L(bytebybyte):
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
+	offset and string descriptors are only updated in the end
+	of loop unrolling. */
+	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+	rldicl	rTMP, r5, 62, 2
+	cmpdi	cr7, rTMP, 0
+	beq	cr7, L(lessthan4)
+	mtctr	rTMP
+#endif
+L(loop):
+	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
+	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 != '\0') || (r == 1) */
+	beq	cr1, L(done)
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	cmpdi	rCHAR1, 0
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq     cr1, L(done)
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+	bdnz	L(loop)
+#else
+	b	L(loop)
+#endif
+#ifdef USE_AS_STRNCASECMP
+L(lessthan4):
+	clrldi	r5, r5, 62
+	cmpdi	cr7, r5, 0
+	beq	cr7, L(retnull)
+	mtctr	r5
+L(loop1):
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	addi	rSTR1, rSTR1, 1
+	addi	rSTR2, rSTR2, 1
+	lbz	rCHAR1, 0(rSTR1)
+	lbz	rCHAR2, 0(rSTR2)
+	bdnz	L(loop1)
+#endif
+L(done):
+	subf	r0, rLWR2, rLWR1
+	extsw	rRTN, r0
+	blr
+END (__STRCASECMP)
+
+weak_alias (__STRCASECMP, STRCASECMP)
+libc_hidden_builtin_def (__STRCASECMP)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
new file mode 100644
index 0000000000..0e746b7718
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
@@ -0,0 +1,29 @@
+/* Optimized strcasestr implementation for PowerPC64/POWER8.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+#define STRCASESTR __strcasestr_ppc
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(__name)
+
+#undef weak_alias
+#define weak_alias(a,b)
+extern __typeof (strcasestr) __strcasestr_ppc attribute_hidden;
+
+#include <string/strcasestr.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S
new file mode 100644
index 0000000000..6ac6572f3b
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S
@@ -0,0 +1,538 @@
+/* Optimized strcasestr implementation for PowerPC64/POWER8.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* Char * [r3] strcasestr (char *s [r3], char * pat[r4])  */
+
+/* The performance gain is obtained by comparing 16 bytes.  */
+
+/* When the first char of r4 is hit ITERATIONS times in r3
+   fallback to default.  */
+#define ITERATIONS	64
+
+#ifndef STRCASESTR
+# define STRCASESTR __strcasestr
+#endif
+
+#ifndef STRLEN
+/* For builds without IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define STRLEN   __GI_strlen
+# else
+#  define STRLEN   strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds without IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define STRNLEN   __GI_strnlen
+# else
+#  define STRNLEN    __strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+#  define STRCHR   __GI_strchr
+# else
+#  define STRCHR   strchr
+# endif
+#endif
+
+/* Convert 16 bytes of v4 and reg to lowercase and compare.  */
+#define TOLOWER(reg)     \
+	vcmpgtub	v6, v4, v1; \
+	vcmpgtub	v7, v2, v4; \
+	vand	v8, v7, v6; \
+	vand	v8, v8, v3; \
+	vor	v4, v8, v4; \
+	vcmpgtub	v6, reg, v1; \
+	vcmpgtub	v7, v2, reg; \
+	vand	v8, v7, v6; \
+	vand	v8, v8, v3; \
+	vor	reg, v8, reg; \
+	vcmpequb.	v6, reg, v4;
+
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#ifdef _ARCH_PWR8
+#define VCLZD_V8_v7	vclzd	v8, v7;
+#else
+#define VCLZD_V8_v7	.long	0x11003fc2
+#endif
+
+#define	FRAMESIZE	(FRAME_MIN_SIZE+48)
+/* TODO: change this to .machine power8 when the minimum required binutils
+   allows it.  */
+	.machine  power7
+EALIGN (STRCASESTR, 4, 0)
+	CALL_MCOUNT 2
+	mflr	r0			/* Load link register LR to r0.  */
+	std	r31, -8(r1)		/* Save callers register r31.  */
+	std	r30, -16(r1)		/* Save callers register r30.  */
+	std	r29, -24(r1)		/* Save callers register r29.  */
+	std	r28, -32(r1)		/* Save callers register r28.  */
+	std	r27, -40(r1)		/* Save callers register r27.  */
+	std	r0, 16(r1)		/* Store the link register.  */
+	cfi_offset(r31, -8)
+	cfi_offset(r30, -16)
+	cfi_offset(r29, -24)
+	cfi_offset(r28, -32)
+	cfi_offset(r27, -40)
+	cfi_offset(lr, 16)
+	stdu	r1, -FRAMESIZE(r1)	/* Create the stack frame.  */
+	cfi_adjust_cfa_offset(FRAMESIZE)
+
+	dcbt	0, r3
+	dcbt	0, r4
+	cmpdi	cr7, r3, 0		/* Input validation.  */
+	beq	cr7, L(retnull)
+	cmpdi	cr7, r4, 0
+	beq	cr7, L(retnull)
+
+	mr	r29, r3
+	mr	r30, r4
+	/* Load first byte from r4 and check if its null.  */
+	lbz	r6, 0(r4)
+	cmpdi	cr7, r6, 0
+	beq	cr7, L(ret_r3)
+
+	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
+	add	r9, r10, __libc_tsd_LOCALE@tls
+	ld	r9, 0(r9)
+	ld	r9, LOCALE_CTYPE_TOUPPER(r9)
+	sldi	r10, r6, 2		/* Convert to upper case.  */
+	lwzx	r28, r9, r10
+
+	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
+	add	r11, r10, __libc_tsd_LOCALE@tls
+	ld	r11, 0(r11)
+	ld	r11, LOCALE_CTYPE_TOLOWER(r11)
+	sldi	r10, r6, 2              /* Convert to lower case.  */
+	lwzx	r27, r11, r10
+
+	/* Check if the first char is present.  */
+	mr	r4, r27
+	bl	STRCHR
+	nop
+	mr	r5, r3
+	mr	r3, r29
+	mr	r29, r5
+	mr	r4, r28
+	bl	STRCHR
+	nop
+	cmpdi	cr7, r29, 0
+	beq	cr7, L(firstpos)
+	cmpdi	cr7, r3, 0
+	beq	cr7, L(skipcheck)
+	cmpw	cr7, r3, r29
+	ble 	cr7, L(firstpos)
+	/* Move r3 to the first occurence.  */
+L(skipcheck):
+	mr	r3, r29
+L(firstpos):
+	mr	r29, r3
+
+	sldi	r9, r27, 8
+	or	r28, r9, r28
+	/* Reg r27 is used to count the number of iterations.  */
+	li	r27, 0
+	/* If first char of search str is not present.  */
+	cmpdi	cr7, r3, 0
+	ble	cr7, L(end)
+
+	/* Find the length of pattern.  */
+	mr	r3, r30
+	bl	STRLEN
+	nop
+
+	cmpdi	cr7, r3, 0	/* If search str is null.  */
+	beq	cr7, L(ret_r3)
+
+	mr	r31, r3
+	mr	r4, r3
+	mr	r3, r29
+	bl	STRNLEN
+	nop
+
+	cmpd	cr7, r3, r31 	/* If len(r3) < len(r4).  */
+	blt	cr7, L(retnull)
+
+	mr	r3, r29
+
+	/* Locales not matching ASCII for single bytes.  */
+	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
+	add	r9, r10, __libc_tsd_LOCALE@tls
+	ld	r9, 0(r9)
+	ld	r7, 0(r9)
+	addi	r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+	lwz	r8, 0(r7)
+	cmpdi	cr7, r8, 1
+	beq	cr7, L(bytebybyte)
+
+	/* If len(r4) < 16 handle byte by byte.  */
+	/* For shorter strings we will not use vector registers.  */
+	cmpdi	cr7, r31, 16
+	blt	cr7, L(bytebybyte)
+
+	/* Comparison values used for TOLOWER.  */
+	/* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte.  */
+	vspltish	v0, 0
+	vspltisb	v5, 2
+	vspltisb	v4, 4
+	vsl	v3, v5, v4
+	vaddubm	v1, v3, v3
+	vspltisb	v5, 15
+	vaddubm	v2, v5, v5
+	vaddubm	v2, v1, v2
+	vspltisb	v4, -3
+	vaddubm	v2, v2, v4
+
+	/*
+	1. Load 16 bytes from r3 and r4
+	2. Check if there is null, If yes, proceed byte by byte path.
+	3. Else,Convert both to lowercase and compare.
+	4. If they are same proceed to 1.
+	5. If they dont match, find if first char of r4 is present in the
+	   loaded 16 byte of r3.
+	6. If yes, move position, load next 16 bytes of r3 and proceed to 2.
+	*/
+
+	mr	r8, r3		/* Save r3 for future use.  */
+	mr	r4, r30		/* Restore r4.  */
+	clrldi	r10, r4, 60
+	lvx	v5, 0, r4	/* Load 16 bytes from r4.  */
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(begin2)
+	/* If r4 is unaligned, load another 16 bytes.  */
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v7, 0, r4
+#else
+	lvsl	v7, 0, r4
+#endif
+	addi	r5, r4, 16
+	lvx	v9, 0, r5
+#ifdef __LITTLE_ENDIAN__
+	vperm	v5, v9, v5, v7
+#else
+	vperm	v5, v5, v9, v7
+#endif
+L(begin2):
+	lvx	v4, 0, r3
+	vcmpequb.	v7, v0, v4	/* Check for null.  */
+	beq	cr6, L(nullchk6)
+	b	L(trailcheck)
+
+        .align  4
+L(nullchk6):
+	clrldi	r10, r3, 60
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(next16)
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v7, 0, r3
+#else
+	lvsl	v7, 0, r3
+#endif
+	addi	r5, r3, 16
+	/* If r3 is unaligned, load another 16 bytes.  */
+	lvx	v10, 0, r5
+#ifdef __LITTLE_ENDIAN__
+	vperm	v4, v10, v4, v7
+#else
+	vperm	v4, v4, v10, v7
+#endif
+L(next16):
+	vcmpequb.	v6, v0, v5	/* Check for null.  */
+	beq	cr6, L(nullchk)
+	b	L(trailcheck)
+
+	.align	4
+L(nullchk):
+	vcmpequb.	v6, v0, v4
+	beq	cr6, L(nullchk1)
+	b	L(retnull)
+
+	.align	4
+L(nullchk1):
+	/* Convert both v3 and v4 to lower.  */
+	TOLOWER(v5)
+	/* If both are same, branch to match.  */
+	blt	cr6, L(match)
+	/* Find if the first char is present in next 15 bytes.  */
+#ifdef __LITTLE_ENDIAN__
+	vspltb	v6, v5, 15
+	vsldoi	v7, v0, v4, 15
+#else
+	vspltb	v6, v5, 0
+	vspltisb	v7, 8
+	vslo	v7, v4, v7
+#endif
+	vcmpequb	v7, v6, v7
+	vcmpequb.	v6, v0, v7
+	/* Shift r3 by 16 bytes and proceed.  */
+	blt	cr6, L(shift16)
+	VCLZD_V8_v7
+#ifdef __LITTLE_ENDIAN__
+	vspltb	v6, v8, 15
+#else
+	vspltb	v6, v8, 7
+#endif
+	vcmpequb.	v6, v6, v1
+	/* Shift r3 by 8  bytes and proceed.  */
+	blt	cr6, L(shift8)
+	b	L(begin)
+
+	.align	4
+L(match):
+	/* There is a match of 16 bytes, check next bytes.  */
+	cmpdi	cr7, r31, 16
+	mr	r29, r3
+	beq	cr7, L(ret_r3)
+
+L(secondmatch):
+	addi	r3, r3, 16
+	addi	r4, r4, 16
+	/* Load next 16 bytes of r3 and r4 and compare.  */
+	clrldi	r10, r4, 60
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(nextload)
+	/* Handle unaligned case.  */
+	vor	v6, v9, v9
+	vcmpequb.	v7, v0, v6
+	beq	cr6, L(nullchk2)
+	b	L(trailcheck)
+
+	.align	4
+L(nullchk2):
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v7, 0, r4
+#else
+	lvsl	v7, 0, r4
+#endif
+	addi	r5, r4, 16
+	/* If r4 is unaligned, load another 16 bytes.  */
+	lvx	v9, 0, r5
+#ifdef __LITTLE_ENDIAN__
+	vperm	v11, v9, v6, v7
+#else
+	vperm	v11, v6, v9, v7
+#endif
+	b	L(compare)
+
+	.align	4
+L(nextload):
+	lvx	v11, 0, r4
+L(compare):
+	vcmpequb.	v7, v0, v11
+	beq	cr6, L(nullchk3)
+	b	L(trailcheck)
+
+	.align	4
+L(nullchk3):
+	clrldi	r10, r3, 60
+	cmpdi 	cr7, r10, 0
+	beq 	cr7, L(nextload1)
+	/* Handle unaligned case.  */
+	vor	v4, v10, v10
+	vcmpequb.	v7, v0, v4
+	beq	cr6, L(nullchk4)
+	b	L(retnull)
+
+	.align	4
+L(nullchk4):
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v7, 0, r3
+#else
+	lvsl	v7, 0, r3
+#endif
+	addi	r5, r3, 16
+	/* If r3 is unaligned, load another 16 bytes.  */
+	lvx	v10, 0, r5
+#ifdef __LITTLE_ENDIAN__
+	vperm	v4, v10, v4, v7
+#else
+	vperm	v4, v4, v10, v7
+#endif
+	b	L(compare1)
+
+	.align	4
+L(nextload1):
+	lvx	v4, 0, r3
+L(compare1):
+	vcmpequb.	v7, v0, v4
+	beq	cr6, L(nullchk5)
+	b	L(retnull)
+
+	.align	4
+L(nullchk5):
+	/* Convert both v3 and v4 to lower.  */
+	TOLOWER(v11)
+	/* If both are same, branch to secondmatch.  */
+	blt 	cr6, L(secondmatch)
+	/* Continue the search.  */
+        b	L(begin)
+
+	.align	4
+L(trailcheck):
+	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
+	add	r11, r10, __libc_tsd_LOCALE@tls
+	ld	r11, 0(r11)
+	ld	r11, LOCALE_CTYPE_TOLOWER(r11)
+L(loop2):
+	lbz	r5, 0(r3)               /* Load byte from r3.  */
+	lbz	r6, 0(r4)               /* Load next byte from r4.  */
+	cmpdi 	cr7, r6, 0              /* Is it null?  */
+	beq 	cr7, L(updater3)
+	cmpdi 	cr7, r5, 0              /* Is it null?  */
+	beq 	cr7, L(retnull)         /* If yes, return.  */
+	addi	r3, r3, 1
+	addi	r4, r4, 1               /* Increment r4.  */
+	sldi	r10, r5, 2              /* Convert to lower case.  */
+	lwzx	r10, r11, r10
+	sldi	r7, r6, 2               /* Convert to lower case.  */
+	lwzx	r7, r11, r7
+	cmpw	cr7, r7, r10            /* Compare with byte from r4.  */
+	bne	cr7, L(begin)
+	b	L(loop2)
+
+	.align	4
+L(shift8):
+	addi	r8, r8, 7
+	b	L(begin)
+	.align	4
+L(shift16):
+	addi	r8, r8, 15
+	.align	4
+L(begin):
+	addi	r8, r8, 1
+	mr	r3, r8
+	/* When our iterations exceed ITERATIONS,fall back to default.  */
+	addi	r27, r27, 1
+	cmpdi	cr7, r27, ITERATIONS
+	beq	cr7, L(default)
+	mr	r4, r30         /* Restore r4.  */
+	b	L(begin2)
+
+	/* Handling byte by byte.  */
+	.align	4
+L(loop1):
+	mr	r3, r8
+	addi	r27, r27, 1
+	cmpdi	cr7, r27, ITERATIONS
+	beq	cr7, L(default)
+	mr	r29, r8
+	srdi	r4, r28, 8
+	/* Check if the first char is present.  */
+	bl	STRCHR
+	nop
+	mr	r5, r3
+	mr	r3, r29
+	mr	r29, r5
+	sldi	r4, r28, 56
+	srdi	r4, r4, 56
+	bl	STRCHR
+	nop
+	cmpdi	cr7, r29, 0
+	beq	cr7, L(nextpos)
+	cmpdi	cr7, r3, 0
+	beq	cr7, L(skipcheck1)
+	cmpw	cr7, r3, r29
+	ble 	cr7, L(nextpos)
+	/* Move r3 to first occurence.  */
+L(skipcheck1):
+	mr	r3, r29
+L(nextpos):
+	mr	r29, r3
+	cmpdi 	cr7, r3, 0
+	ble 	cr7, L(retnull)
+L(bytebybyte):
+	ld	r10, __libc_tsd_LOCALE@got@tprel(r2)
+	add	r11, r10, __libc_tsd_LOCALE@tls
+	ld	r11, 0(r11)
+	ld	r11, LOCALE_CTYPE_TOLOWER(r11)
+	mr	r4, r30                 /* Restore r4.  */
+	mr	r8, r3                  /* Save r3.  */
+	addi	r8, r8, 1
+
+L(loop):
+	addi	r3, r3, 1
+	lbz	r5, 0(r3)               /* Load byte from r3.  */
+	addi	r4, r4, 1               /* Increment r4.  */
+	lbz	r6, 0(r4)               /* Load next byte from r4.  */
+	cmpdi 	cr7, r6, 0              /* Is it null?  */
+	beq 	cr7, L(updater3)
+	cmpdi 	cr7, r5, 0              /* Is it null?  */
+	beq 	cr7, L(retnull)         /* If yes, return.  */
+	sldi	r10, r5, 2              /* Convert to lower case.  */
+	lwzx	r10, r11, r10
+	sldi	r7, r6, 2               /* Convert to lower case.  */
+	lwzx	r7, r11, r7
+	cmpw	cr7, r7, r10            /* Compare with byte from r4.  */
+	bne 	cr7, L(loop1)
+	b	L(loop)
+
+	/* Handling return values.  */
+	.align	4
+L(updater3):
+	subf	r3, r31, r3	/* Reduce r31 (len of r4) from r3.  */
+	b	L(end)
+
+	.align	4
+L(ret_r3):
+	mr	r3, r29		/* Return point of match.  */
+	b	L(end)
+
+	.align	4
+L(retnull):
+	li	r3, 0		/* Substring was not found.  */
+	b	L(end)
+
+	.align	4
+L(default):
+	mr	r4, r30
+	bl	__strcasestr_ppc
+	nop
+
+	.align	4
+L(end):
+	addi	r1, r1, FRAMESIZE	/* Restore stack pointer.  */
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	ld	r0, 16(r1)	/* Restore the saved link register.  */
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)	/* Restore callers save register r29.  */
+	ld	r30, -16(r1)	/* Restore callers save register r30.  */
+	ld	r31, -8(r1)	/* Restore callers save register r31.  */
+	cfi_restore(lr)
+	cfi_restore(r27)
+	cfi_restore(r28)
+	cfi_restore(r29)
+	cfi_restore(r30)
+	cfi_restore(r31)
+	mtlr	r0		/* Branch to link register.  */
+	blr
+END (STRCASESTR)
+
+weak_alias (__strcasestr, strcasestr)
+libc_hidden_def (__strcasestr)
+libc_hidden_builtin_def (strcasestr)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S
new file mode 100644
index 0000000000..e0c185c162
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S
@@ -0,0 +1,377 @@
+/* Optimized strchr implementation for PowerPC64/POWER8.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCHRNUL
+# ifndef STRCHRNUL
+#   define FUNC_NAME __strchrnul
+# else
+#   define FUNC_NAME STRCHRNUL
+# endif
+#else
+# ifndef STRCHR
+#  define FUNC_NAME strchr
+# else
+#  define FUNC_NAME STRCHR
+# endif
+#endif  /* !USE_AS_STRCHRNUL  */
+
+/* int [r3] strchr (char *s [r3], int c [r4])  */
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b)  .long (0x1000054c \
+			| ((t)<<(32-11)) \
+			| ((a)<<(32-16)) \
+			| ((b)<<(32-21)) )
+/* TODO: change this to .machine power8 when the minimum required binutils
+   allows it.  */
+	.machine  power7
+ENTRY (FUNC_NAME)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
+	cmpdi	cr7,r4,0
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+
+	beq	cr7,L(null_match)
+
+	/* Replicate byte to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi  r4,r4,32,0
+
+	/* Now r4 has a doubleword of c bytes and r0 has
+	   a doubleword of null bytes.  */
+
+	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
+	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+#else
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+#endif
+	or	r5,r10,r11    /* OR the results to speed things up.  */
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop)
+
+	/* Handle WORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb    r10,r12,r4
+	cmpb	r11,r12,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r9,16(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	cmpb	r6,r9,r4
+	cmpb	r7,r9,r0
+	or	r5,r10,r11
+	or	r9,r6,r7
+	or	r12,r5,r9
+	cmpdi	cr7,r12,0
+	beq	cr7,L(vector)
+	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c/null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second doubleword.  Adjust the
+	   address again and move the result of cmpb to r10 so we can calculate
+	   the pointer.  */
+
+	mr	r10,r6
+	mr	r11,r7
+	addi	r8,r8,8
+#ifdef USE_AS_STRCHRNUL
+	mr	r5, r9
+#endif
+	/* r10/r11 have the output of the cmpb instructions, that is,
+	   0xff in the same position as the c/null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done):
+#ifdef USE_AS_STRCHRNUL
+	mr	r10, r5
+#endif
+#ifdef __LITTLE_ENDIAN__
+	addi    r3,r10,-1
+	andc    r3,r3,r10
+	popcntd	r0,r3
+# ifndef USE_AS_STRCHRNUL
+	addi    r4,r11,-1
+	andc    r4,r4,r11
+	cmpld	cr7,r3,r4
+	bgt	cr7,L(no_match)
+# endif
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
+# ifndef USE_AS_STRCHRNUL
+	cmpld	cr7,r11,r10
+	bgt	cr7,L(no_match)
+# endif
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching c byte
+				 or null in case c was not found.  */
+	blr
+
+	/* Check the first 32B in GPR's and move to vectorized loop.  */
+	.p2align  5
+L(vector):
+	addi	r3, r8, 8
+	andi.	r10, r3, 31
+	bne	cr0, L(loop)
+	vspltisb	v0, 0
+	/* Precompute vbpermq constant.  */
+	vspltisb	v10, 3
+	lvsl	v11, r0, r0
+	vslb	v10, v11, v10
+	MTVRD(v1,r4)
+	li	r5, 16
+	vspltb	v1, v1, 7
+	/* Compare 32 bytes in each loop.  */
+L(continue):
+	lvx	v4, 0, r3
+	lvx	v5, r3, r5
+	vcmpequb	v2, v0, v4
+	vcmpequb	v3, v0, v5
+	vcmpequb	v6, v1, v4
+	vcmpequb	v7, v1, v5
+	vor	v8, v2, v3
+	vor	v9, v6, v7
+	vor	v11, v8, v9
+	vcmpequb.	v11, v0, v11
+	addi	r3, r3, 32
+	blt	cr6, L(continue)
+	/* One (or both) of the quadwords contains a c/null byte.  */
+	addi	r3, r3, -32
+#ifndef USE_AS_STRCHRNUL
+	vcmpequb.	v11, v0, v9
+	blt	cr6, L(no_match)
+#endif
+	/* Permute the first bit of each byte into bits 48-63.  */
+	VBPERMQ(v2, v2, v10)
+	VBPERMQ(v3, v3, v10)
+	VBPERMQ(v6, v6, v10)
+	VBPERMQ(v7, v7, v10)
+	/* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi	v3, v3, v3, 2
+	vsldoi	v7, v7, v7, 2
+#else
+	vsldoi	v2, v2, v2, 6
+	vsldoi	v3, v3, v3, 4
+	vsldoi	v6, v6, v6, 6
+	vsldoi	v7, v7, v7, 4
+#endif
+
+        /* Merge the results and move to a GPR.  */
+        vor     v1, v3, v2
+        vor     v2, v6, v7
+        vor     v4, v1, v2
+	MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+	addi	r6, r5, -1
+	andc	r6, r6, r5
+	popcntd	r6, r6
+#else
+	cntlzd	r6, r5	/* Count leading zeros before the match.  */
+#endif
+	add	r3, r3, r6	/* Compute final length.  */
+	/* Return NULL if null found before c.  */
+#ifndef USE_AS_STRCHRNUL
+	lbz	r4, 0(r3)
+	cmpdi	cr7, r4, 0
+	beq	cr7, L(no_match)
+#endif
+	blr
+
+#ifndef USE_AS_STRCHRNUL
+	.align	4
+L(no_match):
+	li	r3,0
+	blr
+#endif
+
+/* We are here because strchr was called with a null byte.  */
+	.align	4
+L(null_match):
+	/* r0 has a doubleword of null bytes.  */
+
+	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
+	sld	r5,r5,r6
+	srd	r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0      /* If r10 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done_null)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop_null)
+
+	/* Handle WORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb    r5,r12,r0
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done_null)
+	b	L(loop_null)  /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32-bytes.  */
+	.p2align  5
+L(loop_null):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu     r11,16(r8)
+	cmpb	r5,r12,r0
+	cmpb	r10,r11,r0
+	or	r6,r5,r10
+	cmpdi	cr7,r6,0
+	beq	cr7,L(vector1)
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done_null)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   pointer.  */
+
+	mr	r5,r10
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching null byte.  */
+	blr
+	.p2align  5
+L(vector1):
+	addi    r3, r8, 8
+	andi.	r10, r3, 31
+	bne	cr0, L(loop_null)
+	vspltisb	v8, -1
+	vspltisb	v0, 0
+	vspltisb	v10, 3
+	lvsl	v11, r0, r0
+	vslb	v10, v11, v10
+	li	r5, 16
+L(continue1):
+	lvx	v4, 0, r3
+	lvx	v5, r3, r5
+	vcmpequb	v2, v0, v4
+	vcmpequb	v3, v0, v5
+	vor	v8, v2, v3
+	vcmpequb.	v11, v0, v8
+	addi	r3, r3, 32
+	blt	cr6, L(continue1)
+	addi	r3, r3, -32
+L(end1):
+	VBPERMQ(v2, v2, v10)
+	VBPERMQ(v3, v3, v10)
+	/* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi	v3, v3, v3, 2
+#else
+	vsldoi	v2, v2, v2, 6
+	vsldoi	v3, v3, v3, 4
+#endif
+
+        /* Merge the results and move to a GPR.  */
+        vor     v4, v3, v2
+	MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+	addi	r6, r5, -1
+	andc	r6, r6, r5
+	popcntd	r6, r6
+#else
+	cntlzd	r6, r5	/* Count leading zeros before the match.  */
+#endif
+	add	r3, r3, r6	/* Compute final length.  */
+	blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STRCHRNUL
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S
new file mode 100644
index 0000000000..3bf4b275dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S
@@ -0,0 +1,23 @@
+/* Optimized strchrnul implementation for PowerPC64/POWER8.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STRCHRNUL 1
+#include <sysdeps/powerpc/powerpc64/power8/strchr.S>
+
+weak_alias (__strchrnul,strchrnul)
+libc_hidden_builtin_def (__strchrnul)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S
new file mode 100644
index 0000000000..770484f1e1
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -0,0 +1,247 @@
+/* Optimized strcmp implementation for PowerPC64/POWER8.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
+/* Implements the function
+
+   size_t [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+EALIGN (STRCMP, 4, 0)
+	li	r0,0
+
+	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+	   the code:
+
+	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+	   with PAGE_SIZE being 4096 and ITER_SIZE begin 16.  */
+
+	rldicl	r7,r3,0,52
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r7,4096-16
+	bgt	cr7,L(pagecross_check)
+	cmpldi	cr5,r9,4096-16
+	bgt	cr5,L(pagecross_check)
+
+	/* For short string up to 16 bytes, load both s1 and s2 using
+	   unaligned dwords and compare.  */
+	ld	r8,0(r3)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,8(r3)
+	ld	r10,8(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	addi	r7,r3,16
+	addi	r4,r4,16
+
+L(align_8b):
+	/* Now it has checked for first 16 bytes, align source1 to doubleword
+	   and adjust source2 address.  */
+	rldicl	r9,r7,0,61	/* source1 alignment to doubleword  */
+	subf	r4,r9,r4	/* Adjust source2 address based on source1
+				   alignment.  */
+	rldicr	r7,r7,0,60	/* Align source1 to doubleword.  */
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check is source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+	andi.	r9,r4,0x7
+	bne	cr0,L(loop_diff_align)
+
+	/* If both source1 and source2 are doubleword aligned, there is no
+	   need for page boundary cross checks.  */
+
+	ld	r8,0(r7)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	.align 4
+L(loop_equal_align):
+	ld	r8,8(r7)
+	ld	r10,8(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,16(r7)
+	ld	r10,16(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ldu	r8,24(r7)
+	ldu	r10,24(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	b	L(loop_equal_align)
+
+	/* A zero byte was found in r8 (s1 dword), r9 contains the cmpb
+	   result and r10 the dword from s2.  To code isolate the byte
+	   up to end (including the '\0'), masking with 0xFF the remaining
+	   ones:
+
+           #if __LITTLE_ENDIAN__
+	     (__builtin_ffsl (x) - 1) = counting trailing zero bits
+	     r9 = (__builtin_ffsl (r9) - 1) + 8;
+	     r9 = -1UL << r9
+	   #else
+	     r9  = __builtin_clzl (r9) + 8;
+	     r9  = -1UL >> r9
+	   #endif
+	     r8  = r8  | r9
+	     r10 = r10 | r9  */
+
+#ifdef __LITTLE_ENDIAN__
+	nor 	r9,r9,r9
+L(different_nocmpb):
+	neg	r3,r9
+	and	r9,r9,r3
+	cntlzd	r9,r9
+	subfic	r9,r9,63
+#else
+	not	r9,r9
+L(different_nocmpb):
+	cntlzd	r9,r9
+	subfic	r9,r9,56
+#endif
+	srd	r3,r8,r9
+	srd	r10,r10,r9
+	rldicl	r10,r10,0,56
+	rldicl	r3,r3,0,56
+	subf	r3,r10,r3
+	extsw	r3,r3
+	blr
+
+	.align	4
+L(pagecross_check):
+	subfic	r9,r9,4096
+	subfic	r7,r7,4096
+	cmpld	cr7,r7,r9
+	bge	cr7,L(pagecross)
+	mr	r7,r9
+
+	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
+	   a simple byte a byte comparison until the page alignment for s1
+	   is reached.  */
+L(pagecross):
+	add	r7,r3,r7
+	subf	r9,r3,r7
+	mtctr	r9
+
+	.align	4
+L(pagecross_loop):
+	/* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
+	   and if *s1 is '\0'.  */
+	lbz	r9,0(r3)
+	lbz	r10,0(r4)
+	addi	r3,r3,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r10
+	cmpdi	cr5,r9,r0
+	bne	cr7,L(pagecross_ne)
+	beq	cr5,L(pagecross_nullfound)
+	bdnz	L(pagecross_loop)
+	b	L(align_8b)
+
+	.align	4
+	/* The unaligned read of source2 will cross a 4K page boundary,
+	   and the different byte or NULL maybe be in the remaining page
+	   bytes. Since it can not use the unaligned load, the algorithm
+	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
+L(check_source2_byte):
+	li	r9,8
+	mtctr	r9
+
+	.align	4
+L(check_source2_byte_loop):
+	lbz	r9,0(r7)
+	lbz	r10,0(r4)
+	addi	r7,r7,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,10
+	cmpdi	r5,r9,0
+	bne	cr7,L(pagecross_ne)
+	beq	cr5,L(pagecross_nullfound)
+	bdnz	L(check_source2_byte_loop)
+
+	/* If source2 is unaligned to doubleword, the code needs to check
+	   on each interation if the unaligned doubleword access will cross
+	   a 4k page boundary.  */
+	.align	5
+L(loop_unaligned):
+	ld	r8,0(r7)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+	addi	r7,r7,8
+	addi	r4,r4,8
+
+L(loop_diff_align):
+	/* Check if [src2]+8 cross a 4k page boundary:
+
+	     srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)
+
+	     with PAGE_SIZE being 4096.  */
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r9,4088
+	ble	cr7,L(loop_unaligned)
+	b	L(check_source2_byte)
+
+	.align	4
+L(pagecross_ne):
+	extsw	r3,r9
+	mr	r9,r10
+L(pagecross_retdiff):
+	subf	r9,r9,r3
+	extsw	r3,r9
+	blr
+
+	.align	4
+L(pagecross_nullfound):
+	li	r3,0
+	b	L(pagecross_retdiff)
+END (STRCMP)
+libc_hidden_builtin_def (strcmp)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S
new file mode 100644
index 0000000000..7f2cee4b1b
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -0,0 +1,270 @@
+/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPCPY
+# ifndef STPCPY
+#   define FUNC_NAME __stpcpy
+# else
+#   define FUNC_NAME STPCPY
+# endif
+#else
+# ifndef STRCPY
+#  define FUNC_NAME strcpy
+# else
+#  define FUNC_NAME STRCPY
+# endif
+#endif  /* !USE_AS_STPCPY  */
+
+/* Implements the function
+
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+   or
+
+   char * [r3] stpcpy (char *dest [r3], const char *src [r4])
+
+   if USE_AS_STPCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+        li      r0,0          /* Doubleword with null chars to use
+                                 with cmpb.  */
+
+	/* Check if the [src]+15 will cross a 4K page by checking if the bit
+	   indicating the page size changes.  Basically:
+
+	   uint64_t srcin = (uint64_t)src;
+	   uint64_t ob = srcin & 4096UL;
+	   uint64_t nb = (srcin+15UL) & 4096UL;
+	   if (ob ^ nb)
+	     goto pagecross;  */
+
+	addi	r9,r4,15
+	xor	r9,r9,r4
+	rlwinm.	r9,r9,0,19,19
+	bne	L(pagecross)
+
+	/* For short string (less than 16 bytes), just calculate its size as
+	   strlen and issues a memcpy if null is found.  */
+	mr	r7,r4
+        ld      r12,0(r7)     /* Load doubleword from memory.  */
+        cmpb    r10,r12,r0    /* Check for null bytes in DWORD1.  */
+        cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+        bne     cr7,L(done)
+
+        ldu     r8,8(r7)
+        cmpb    r10,r8,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+	b	L(loop_before)
+
+	.align	4
+L(pagecross):
+	clrrdi  r7,r4,3       /* Align the address to doubleword boundary.  */
+	rlwinm  r6,r4,3,26,28 /* Calculate padding.  */
+	li      r5,-1         /* MASK = 0xffffffffffffffff.  */
+        ld      r12,0(r7)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+        sld     r5,r5,r6
+#else
+        srd     r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+        orc     r9,r12,r5     /* Mask bits that are not part of the string.  */
+        cmpb    r10,r9,r0     /* Check for null bytes in DWORD1.  */
+        cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+        bne     cr7,L(done)
+
+        ldu     r6,8(r7)
+        cmpb    r10,r6,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+        ld      r12,0(r7)
+        cmpb    r10,r12,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+        ldu     r6,8(r7)
+        cmpb    r10,r6,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+	/* We checked for 24 - x bytes, with x being the source alignment
+	   (0 <= x <= 16), and no zero has been found.  Start the loop
+	   copy with doubleword aligned address.  */
+	mr	r7,r4
+	ld	r12, 0(r7)
+	ldu	r8, 8(r7)
+
+L(loop_before):
+	/* Save the two doublewords readed from source and align the source
+	   to 16 bytes for the loop.  */
+	mr	r11,r3
+	std	r12,0(r11)
+	std	r8,8(r11)
+	addi	r11,r11,16
+	rldicl	r9,r4,0,60
+	subf	r7,r9,r7
+	subf	r11,r9,r11
+	b	L(loop_start)
+
+        .align  5
+L(loop):
+        std     r12, 0(r11)
+        std     r6, 8(r11)
+	addi	r11,r11,16
+L(loop_start):
+        /* Load two doublewords, compare and merge in a
+           single register for speed.  This is an attempt
+           to speed up the null-checking process for bigger strings.  */
+
+        ld      r12, 8(r7)
+        ldu     r6, 16(r7)
+        cmpb    r10,r12,r0
+        cmpb    r9,r6,r0
+        or      r8,r9,r10     /* Merge everything in one doubleword.  */
+        cmpdi   cr7,r8,0
+        beq     cr7,L(loop)
+
+
+        /* OK, one (or both) of the doublewords contains a null byte.  Check
+           the first doubleword and decrement the address in case the first
+           doubleword really contains a null byte.  */
+
+	addi	r4,r7,-8
+        cmpdi   cr6,r10,0
+        addi    r7,r7,-8
+        bne     cr6,L(done2)
+
+        /* The null byte must be in the second doubleword.  Adjust the address
+           again and move the result of cmpb to r10 so we can calculate the
+           length.  */
+
+        mr      r10,r9
+        addi    r7,r7,8
+	b	L(done2)
+
+        /* r10 has the output of the cmpb instruction, that is, it contains
+           0xff in the same position as the null byte in the original
+           doubleword from the string.  Use that to calculate the length.  */
+L(done):
+	mr	r11,r3
+L(done2):
+#ifdef __LITTLE_ENDIAN__
+        addi    r9, r10, -1   /* Form a mask from trailing zeros.  */
+        andc    r9, r9, r10
+        popcntd r6, r9        /* Count the bits in the mask.  */
+#else
+        cntlzd  r6,r10        /* Count leading zeros before the match.  */
+#endif
+        subf    r5,r4,r7
+        srdi    r6,r6,3       /* Convert leading/trailing zeros to bytes.  */
+        add     r8,r5,r6      /* Compute final length.  */
+#ifdef USE_AS_STPCPY
+	/* stpcpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
+	addi	r8,r8,1       /* Final '/0'.  */
+
+	cmpldi	cr6,r8,8
+	mtocrf	0x01,r8
+	ble	cr6,L(copy_LE_8)
+
+	cmpldi	cr1,r8,16
+	blt	cr1,8f
+
+	/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	/* At least 6 bytes to go.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	ld	r6,0(r4)
+	ld	r8,8(r4)
+	addi	r4,r4,16
+	std	r6,0(r11)
+	std	r8,8(r11)
+	addi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	ld	r6,0(r4)
+	addi	r4,r4,8
+	std	r6,0(r11)
+	addi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	r6,0(r4)
+	stw	r6,0(r11)
+	bf	30,L(tail5)
+	lhz	r7,4(r4)
+	sth	r7,4(r11)
+	bflr	31
+	lbz	r8,6(r4)
+	stb	r8,6(r11)
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	r6,0(r4)
+	sth	r6,0(r11)
+	bflr	31
+	lbz	r7,2(r4)
+	stb	r7,2(r11)
+	blr
+
+	.align	4
+L(tail5):
+	bf	31,1f
+	lbz	r6,4(r4)
+	stb	r6,4(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	r6,0(r4)
+	stb	r6,0(r11)
+	blr
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+	ld	r6,0(r4)
+	std	r6,0(r11)
+	blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S
new file mode 100644
index 0000000000..c9a7a2e3c3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S
@@ -0,0 +1,20 @@
+/* Optimized strcspn implementation for PowerPC64/POWER8.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STRCSPN 1
+#include <sysdeps/powerpc/powerpc64/power8/strspn.S>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S
new file mode 100644
index 0000000000..8f4a1fc1dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S
@@ -0,0 +1,301 @@
+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
+   loop.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#define MFVRD(r,v)	.long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b)	.long (0x1000054c \
+			       | ((t)<<(32-11))	\
+			       | ((a)<<(32-16))	\
+			       | ((b)<<(32-21)) )
+
+/* int [r3] strlen (char *s [r3])  */
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+/* TODO: change this to .machine power8 when the minimum required binutils
+   allows it.  */
+	.machine  power7
+EALIGN (STRLEN, 4, 0)
+	CALL_MCOUNT 1
+	dcbt	0,r3
+	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
+	ld	r12,0(r4)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r5,r5,r6
+#else
+	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
+	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+	bne	cr7,L(done)
+
+	/* For shorter strings (< 64 bytes), we will not use vector registers,
+	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
+	   will be done the same way as we do in the POWER7 implementation.
+	   Let's see if we are aligned to a quadword boundary.  If so, we can
+	   jump to the first (non-vectorized) loop.  Otherwise, we have to
+	   handle the next DWORD first.  */
+	mtcrf	0x01,r4
+	mr	r9,r4
+	addi	r9,r9,8
+	bt	28,L(align64)
+
+	/* Handle the next 8 bytes so we are aligned to a quadword
+	   boundary.  */
+	ldu	r5,8(r4)
+	cmpb	r10,r5,r0
+	cmpdi	cr7,r10,0
+	addi	r9,r9,8
+	bne	cr7,L(done)
+
+L(align64):
+	/* Proceed to the old (POWER7) implementation, checking two doublewords
+	   per iteraction.  For the first 56 bytes, we will just check for null
+	   characters.  After that, we will also check if we are 64-byte aligned
+	   so we can jump to the vectorized implementation.  We will unroll
+	   these loops to avoid excessive branching.  */
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
+	   Note: aligning to 64-byte will necessarily slow down performance for
+	   strings around 64 bytes in length due to the extra comparisons
+	   required to check alignment for the vectorized loop.  This is a
+	   necessary tradeoff we are willing to take in order to speed up the
+	   calculation for larger strings.  */
+	andi.	r10,r9,63
+	beq	cr0,L(preloop)
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	andi.	r10,r9,63
+	beq	cr0,L(preloop)
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	andi.	r10,r9,63
+	beq	cr0,L(preloop)
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	andi.	r10,r9,63
+	beq	cr0,L(preloop)
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+
+	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
+	   found, jump to the vectorized loop.  */
+	beq	cr7,L(preloop)
+
+L(dword_zero):
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r10,0
+	addi	r4,r4,-8
+	bne	cr6,L(done)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   length.  */
+
+	mr	r10,r11
+	addi	r4,r4,8
+
+	/* If the null byte was found in the non-vectorized code, compute the
+	   final length.  r10 has the output of the cmpb instruction, that is,
+	   it contains 0xff in the same position as the null byte in the
+	   original doubleword from the string.  Use that to calculate the
+	   length.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
+	andc	r9, r9,r10
+	popcntd	r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	subf	r5,r3,r4
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r5,r0      /* Compute final length.  */
+	blr
+
+	/* Vectorized implementation starts here.  */
+	.p2align  4
+L(preloop):
+	/* Set up for the loop.  */
+	mr	r4,r9
+	li	r7, 16	      /* Load required offsets.  */
+	li	r8, 32
+	li	r9, 48
+	li	r12, 8
+	vxor	v0,v0,v0      /* VR with null chars to use with
+				 vcmpequb.  */
+
+	/* Main loop to look for the end of the string.  We will read in
+	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
+	   leverage the icache performance.  */
+	.p2align  5
+L(loop):
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
+	lvx	  v2,r4,r7
+	lvx	  v3,r4,r8
+	lvx	  v4,r4,r9
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
+	lvx	  v2,r4,r7
+	lvx	  v3,r4,r8
+	lvx	  v4,r4,r9
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
+	lvx	  v2,r4,r7
+	lvx	  v3,r4,r8
+	lvx	  v4,r4,r9
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	beq	  cr6,L(loop)
+
+L(vmx_zero):
+	/* OK, we found a null byte.  Let's look for it in the current 64-byte
+	   block and mark it in its corresponding VR.  */
+	vcmpequb  v1,v1,v0
+	vcmpequb  v2,v2,v0
+	vcmpequb  v3,v3,v0
+	vcmpequb  v4,v4,v0
+
+	/* We will now 'compress' the result into a single doubleword, so it
+	   can be moved to a GPR for the final calculation.  First, we
+	   generate an appropriate mask for vbpermq, so we can permute bits into
+	   the first halfword.  */
+	vspltisb  v10,3
+	lvsl	  v11,r0,r0
+	vslb	  v10,v11,v10
+
+	/* Permute the first bit of each byte into bits 48-63.  */
+	VBPERMQ(v1,v1,v10)
+	VBPERMQ(v2,v2,v10)
+	VBPERMQ(v3,v3,v10)
+	VBPERMQ(v4,v4,v10)
+
+	/* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi  v2,v2,v2,2
+	vsldoi  v3,v3,v3,4
+	vsldoi  v4,v4,v4,6
+#else
+	vsldoi	v1,v1,v1,6
+	vsldoi	v2,v2,v2,4
+	vsldoi	v3,v3,v3,2
+#endif
+
+	/* Merge the results and move to a GPR.  */
+	vor	v1,v2,v1
+	vor	v2,v3,v4
+	vor	v4,v1,v2
+	MFVRD(r10,v4)
+
+	 /* Adjust address to the begninning of the current 64-byte block.  */
+	addi	r4,r4,-64
+
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
+	andc	r9, r9,r10
+	popcntd	r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	subf	r5,r3,r4
+	add	r3,r5,r0      /* Compute final length.  */
+	blr
+
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S
new file mode 100644
index 0000000000..32e09e4d94
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S
@@ -0,0 +1,20 @@
+/* Optimized strncasecmp implementation for POWER8.
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STRNCASECMP 1
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S
new file mode 100644
index 0000000000..3d8df90538
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -0,0 +1,327 @@
+/* Optimized strncmp implementation for PowerPC64/POWER8.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRNCMP
+# define STRNCMP strncmp
+#endif
+
+/* Implements the function
+
+   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+	.machine  power7
+EALIGN (STRNCMP, 4, 0)
+	/* Check if size is 0.  */
+	mr.	r10,r5
+	beq	cr0,L(ret0)
+
+	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+	   the code:
+
+	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+	   with PAGE_SIZE being 4096 and ITER_SIZE begin 16.  */
+	rldicl	r8,r3,0,52
+	cmpldi	cr7,r8,4096-16
+	bgt	cr7,L(pagecross)
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r9,4096-16
+	bgt	cr7,L(pagecross)
+
+	/* For short string up to 16 bytes, load both s1 and s2 using
+	   unaligned dwords and compare.  */
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	li	r8,0
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	/* If the string compared are equal, but size is less or equal
+	   to 8, return 0.  */
+	cmpldi	cr7,r10,8
+	li	r9,0
+	ble	cr7,L(ret1)
+	addi	r5,r10,-8
+
+	ld	r7,8(r3)
+	ld	r9,8(r4)
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different0)
+
+	cmpldi	cr7,r5,8
+	mr	r9,r8
+	ble	cr7,L(ret1)
+
+	/* Update pointers and size.  */
+	addi	r10,r10,-16
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	/* Now it has checked for first 16 bytes, align source1 to doubleword
+	   and adjust source2 address.  */
+L(align_8b):
+	rldicl	r5,r3,0,61
+	rldicr	r3,r3,0,60
+	subf	r4,r5,r4
+	add	r10,r10,r5
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check is source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+	andi.	r8,r4,0x7
+	beq	cr0,L(loop_eq_align_0)
+
+	li	r5,0
+	b	L(loop_ne_align_1)
+
+	/* If source2 is unaligned to doubleword, the code needs to check
+	   on each interation if the unaligned doubleword access will cross
+	   a 4k page boundary.  */
+	.align 4
+L(loop_ne_align_0):
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	cmpb	r8,r7,r5
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	cmpldi	cr7,r10,8
+	ble	cr7,L(ret0)
+	addi	r10,r10,-8
+	addi	r3,r3,8
+	addi	r4,r4,8
+L(loop_ne_align_1):
+	rldicl	r9,r4,0,52
+	cmpldi	r7,r9,4088
+	ble	cr7,L(loop_ne_align_0)
+	cmpdi	cr7,r10,0
+	beq	cr7,L(ret0)
+
+	lbz	r9,0(r3)
+	lbz	r8,0(r4)
+	cmplw	cr7,r9,r8
+	bne	cr7,L(byte_ne_4)
+	cmpdi	cr7,r9,0
+	beq	cr7,L(size_reached_0)
+
+	li	r9,r7
+	addi	r8,r3,1
+	mtctr	r9
+	addi	r4,r4,1
+	addi	r10,r10,-1
+	addi	r3,r3,8
+
+	/* The unaligned read of source2 will cross a 4K page boundary,
+	   and the different byte or NULL maybe be in the remaining page
+	   bytes.  Since it can not use the unaligned load the algorithm
+	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
+	.align 4
+L(loop_ne_align_byte):
+	cmpdi	cr7,r10,0
+	addi	r10,r10,-1
+	beq	cr7,L(ret0)
+	lbz	r9,0(r8)
+	lbz	r7,0(r4)
+	addi	r8,r8,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r7
+	cmpdi	cr5,r9,0
+	bne	cr7,L(size_reached_2)
+	beq	cr5,L(size_reached_0)
+	bdnz	L(loop_ne_align_byte)
+
+	cmpdi	cr7,r10,0
+	bne+	cr7,L(loop_ne_align_0)
+
+	.align 4
+L(ret0):
+	li	r9,0
+L(ret1):
+	mr	r3,r9
+	blr
+
+	/* The code now check if r8 and r10 are different by issuing a
+	   cmpb and shift the result based on its output:
+
+	#ifdef __LITTLE_ENDIAN__
+	  leadzero = (__builtin_ffsl (z1) - 1);
+	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+	  r1 = (r1 >> leadzero) & 0xFFUL;
+	  r2 = (r2 >> leadzero) & 0xFFUL;
+	#else
+	  leadzero = __builtin_clzl (z1);
+	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+	  r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
+	  r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
+	#endif
+	  return r1 - r2;  */
+
+	.align 4
+L(different0):
+	mr	r10,r5
+#ifdef __LITTLE_ENDIAN__
+L(different1):
+        neg	r11,r8
+        sldi	r10,r10,3
+        and	r8,r11,r8
+        addi	r10,r10,-8
+        cntlzd	r8,r8
+        subfic	r8,r8,63
+        extsw 	r8,r8
+        cmpld	cr7,r8,r10
+        ble	cr7,L(different2)
+        mr	r8,r10
+L(different2):
+        extsw	r8,r8
+#else
+L(different1):
+	addi	r10,r10,-1
+	cntlzd	r8,r8
+	sldi	r10,r10,3
+	cmpld	cr7,r8,r10
+	blt	cr7,L(different2)
+	mr	r8,r10
+L(different2):
+	subfic	r8,r8,56
+#endif
+	srd	r7,r7,r8
+	srd	r9,r9,r8
+	rldicl	r3,r7,0,56
+	rldicl	r9,r9,0,56
+	subf	r9,r9,3
+	extsw	r9,r9
+	mr	r3,r9
+	blr
+
+	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
+	   a simple byte a byte comparison until the page alignment for s1
+	   is reached.  */
+	.align 4
+L(pagecross):
+	lbz	r7,0(r3)
+	lbz	r9,0(r4)
+	subfic	r8,r8,4095
+	cmplw	cr7,r9,r7
+	bne	cr7,L(byte_ne_3)
+	cmpdi	cr7,r9,0
+	beq	cr7,L(byte_ne_0)
+	addi	r10,r10,-1
+	subf	r7,r8,r10
+	subf	r9,r7,r10
+	addi	r9,r9,1
+	mtctr	r9
+	b	L(pagecross_loop1)
+
+	.align 4
+L(pagecross_loop0):
+	beq	cr7,L(ret0)
+	lbz	r9,0(r3)
+	lbz	r8,0(r4)
+	addi	r10,r10,-1
+	cmplw	cr7,r9,r8
+	cmpdi	cr5,r9,0
+	bne	r7,L(byte_ne_2)
+	beq	r5,L(byte_ne_0)
+L(pagecross_loop1):
+	cmpdi	cr7,r10,0
+	addi	r3,r3,1
+	addi	r4,r4,1
+	bdnz	L(pagecross_loop0)
+	cmpdi	cr7,r7,0
+	li	r9,0
+	bne+	cr7,L(align_8b)
+	b	L(ret1)
+
+	/* If both source1 and source2 are doubleword aligned, there is no
+	   need for page boundary cross checks.  */
+	.align 4
+L(loop_eq_align_0):
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	cmpldi	cr7,r10,8
+	ble	cr7,L(ret0)
+	addi	r9,r10,-9
+
+	li	r5,0
+	srdi	r9,r9,3
+	addi	r9,r9,1
+	mtctr	r9
+	b	L(loop_eq_align_2)
+
+	.align 4
+L(loop_eq_align_1):
+	bdz	L(ret0)
+L(loop_eq_align_2):
+	ldu	r7,8(r3)
+	addi	r10,r10,-8
+	ldu	r9,8(r4)
+	cmpb	r8,r7,r5
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	beq	cr0,L(loop_eq_align_1)
+	b	L(different1)
+
+	.align 4
+L(byte_ne_0):
+	li	r7,0
+L(byte_ne_1):
+	subf	r9,r9,r7
+	extsw	r9,r9
+	b	L(ret1)
+
+	.align 4
+L(byte_ne_2):
+	extsw	r7,r9
+	mr	r9,r8
+	b	L(byte_ne_1)
+L(size_reached_0):
+	li	r10,0
+L(size_reached_1):
+	subf	r9,r9,r10
+	extsw	r9,r9
+	b	L(ret1)
+L(size_reached_2):
+	extsw	r10,r9
+	mr	r9,r7
+	b	L(size_reached_1)
+L(byte_ne_3):
+	extsw	r7,r7
+	b	L(byte_ne_1)
+L(byte_ne_4):
+	extsw	r10,r9
+	mr	r9,r8
+	b	L(size_reached_1)
+END(STRNCMP)
+libc_hidden_builtin_def(strncmp)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000000..6d40f30ff7
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,465 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#   define FUNC_NAME __stpncpy
+# else
+#   define FUNC_NAME STPNCPY
+# endif
+#else
+# ifndef STRNCPY
+#  define FUNC_NAME strncpy
+# else
+#  define FUNC_NAME STRNCPY
+# endif
+#endif  /* !USE_AS_STPNCPY  */
+
+#ifndef MEMSET
+/* For builds without IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define MEMSET   __GI_memset
+# else
+#  define MEMSET   memset
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+
+        /* Check if the [src]+15 will cross a 4K page by checking if the bit
+           indicating the page size changes.  Basically:
+
+           uint64_t srcin = (uint64_t)src;
+           uint64_t ob = srcin & 4096UL;
+           uint64_t nb = (srcin+15UL) & 4096UL;
+           if (ob ^ nb)
+             goto pagecross;  */
+
+	addi	r10,r4,16
+	rlwinm	r9,r4,0,19,19
+
+	/* Save some non-volatile registers on the stack.  */
+	std	r26,-48(r1)
+	std	r27,-40(r1)
+
+	rlwinm	r8,r10,0,19,19
+
+	std	r28,-32(r1)
+	std	r29,-24(r1)
+
+	cmpld	cr7,r9,r8
+
+	std	r30,-16(r1)
+	std	r31,-8(r1)
+
+	/* Update CFI.  */
+	cfi_offset(r26, -48)
+	cfi_offset(r27, -40)
+	cfi_offset(r28, -32)
+	cfi_offset(r29, -24)
+	cfi_offset(r30, -16)
+	cfi_offset(r31, -8)
+
+	beq	cr7,L(unaligned_lt_16)
+	rldicl	r9,r4,0,61
+	subfic	r8,r9,8
+	cmpld	cr7,r5,r8
+	bgt 	cr7,L(pagecross)
+
+	/* At this points there is 1 to 15 bytes to check and write.  Since it could
+	   be either from first unaligned 16 bytes access or from bulk copy, the code
+	   uses an unrolled byte read/write instead of trying to analyze the cmpb
+	   results.  */
+L(short_path):
+	mr	r9,r3
+L(short_path_1):
+	/* Return if there are no more bytes to be written.  */
+	cmpdi	cr7,r5,0
+	beq	cr7,L(short_path_loop_end_1)
+L(short_path_2):
+	/* Copy one char from src (r4) and write it to dest (r9).  If it is the
+	   end-of-string, start the null padding.  Continue, otherwise.  */
+	lbz	r10,0(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,0(r9)
+	beq	cr7,L(zero_pad_start_1)
+	/* If there are no more bytes to be written, return.  */
+	cmpdi	cr0,r5,1
+	addi	r8,r9,1
+	addi	r6,r5,-1
+	beq	cr0,L(short_path_loop_end_0)
+	/* Copy another char from src (r4) to dest (r9).  Check again if it is
+	   the end-of-string.  If so, start the null padding.  */
+	lbz	r10,1(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,1(r9)
+	beq	cr7,L(zero_pad_start_prepare_1)
+	/* Eagerly decrement r5 by 3, which is the number of bytes already
+	   written, plus one write that will be performed later on.  */
+	addi	r10,r5,-3
+	b	L(short_path_loop_1)
+
+	.align	4
+L(short_path_loop):
+	/* At this point, the induction variable, r5, as well as the pointers
+	   to dest and src (r9 and r4, respectivelly) have been updated.
+
+	   Note: The registers r7 and r10 are induction variables derived from
+	   r5.  They are used to determine if the total number of writes has
+	   been reached at every other write.
+
+	   Copy one char from src (r4) and write it to dest (r9).  If it is the
+	   end-of-string, start the null padding.  Continue, otherwise.  */
+	lbz	r8,0(r4)
+	addi	r7,r10,-2
+	cmpdi	cr5,r8,0
+	stb	r8,0(r9)
+	beq	cr5,L(zero_pad_start_1)
+	beq	cr7,L(short_path_loop_end_0)
+	/* Copy another char from src (r4) to dest (r9).  Check again if it is
+	   the end-of-string.  If so, start the null padding.  */
+	lbz	r8,1(r4)
+	cmpdi	cr7,r8,0
+	stb	r8,1(r9)
+	beq	cr7,L(zero_pad_start)
+	mr	r10,r7
+L(short_path_loop_1):
+	/* This block is reached after two chars have been already written to
+	   dest.  Nevertheless, r5 (the induction variable), r9 (the pointer to
+	   dest), and r4 (the pointer to src) have not yet been updated.
+
+	   At this point:
+	     r5 holds the count of bytes yet to be written plus 2.
+	     r9 points to the last two chars that were already written to dest.
+	     r4 points to the last two chars that were already copied from src.
+
+	   The algorithm continues by decrementing r5, the induction variable,
+	   so that it reflects the last two writes.  The pointers to dest (r9)
+	   and to src (r4) are increment by two, for the same reason.
+
+	   Note: Register r10 is another induction variable, derived from r5,
+	   which determines if the total number of writes has been reached.  */
+	addic.	r5,r5,-2
+	addi	r9,r9,2
+	cmpdi	cr7,r10,0 /* Eagerly check if the next write is the last.  */
+	addi	r4,r4,2
+	addi	r6,r9,1
+	bne	cr0,L(short_path_loop) /* Check if the total number of writes
+					  has been reached at every other
+					  write.  */
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+	b	L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+	addi	r3,r9,1
+	b	L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+L(short_path_loop_end):
+	/* Restore non-volatile registers.  */
+	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* This code pads the remainder of dest with NULL bytes.  The algorithm
+	   calculates the remaining size and calls memset.  */
+	.align	4
+L(zero_pad_start):
+	mr	r5,r10
+	mr	r9,r6
+L(zero_pad_start_1):
+	/* At this point:
+	     - r5 holds the number of bytes that still have to be written to
+	       dest.
+	     - r9 points to the position, in dest, where the first null byte
+	       will be written.
+	   The above statements are true both when control reaches this label
+	   from a branch or when falling through the previous lines.  */
+#ifndef USE_AS_STPNCPY
+	mr	r30,r3       /* Save the return value of strncpy.  */
+#endif
+	/* Prepare the call to memset.  */
+	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
+	li	r4,0         /* Byte to be written (zero).  */
+
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point, we are sure that
+	   doing so is actually needed.  */
+
+	/* Save the link register.  */
+	mflr	r0
+	std	r0,16(r1)
+	cfi_offset(lr, 16)
+
+	/* Create the stack frame.  */
+	stdu	r1,-FRAMESIZE(r1)
+	cfi_adjust_cfa_offset(FRAMESIZE)
+
+	bl	MEMSET
+	nop
+
+	/* Restore the stack frame.  */
+	addi	r1,r1,FRAMESIZE
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	/* Restore the link register.  */
+	ld	r0,16(r1)
+	mtlr	r0
+
+#ifndef USE_AS_STPNCPY
+	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
+				dest.  For stpncpy, the return value is the
+				same as return value of memset.  */
+#endif
+
+	/* Restore non-volatile registers and return.  */
+	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* The common case where [src]+16 will not cross a 4K page boundary.
+	   In this case the code fast check the first 16 bytes by using doubleword
+	   read/compares and update destiny if neither total size or null byte
+	   is found in destiny. */
+	.align	4
+L(unaligned_lt_16):
+	cmpldi	cr7,r5,7
+	ble	cr7,L(short_path)
+	ld	r7,0(r4)
+	li	r8,0
+	cmpb	r8,r7,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r6,r5,-8
+	std	r7,0(r3)
+	addi	r9,r3,8
+	cmpldi	cr7,r6,7
+	addi	r7,r4,8
+	ble	cr7,L(short_path_prepare_1_1)
+	ld	r4,8(r4)
+	cmpb	r8,r4,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2_1)
+	std	r4,8(r3)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	/* Neither the null byte was found or total length was reached,
+	   align to 16 bytes and issue a bulk copy/compare.  */
+	b	L(align_to_16b)
+
+	/* In the case of 4k page boundary cross, the algorithm first align
+	   the address to a doubleword, calculate a mask based on alignment
+	   to ignore the bytes and continue using doubleword.  */
+	.align	4
+L(pagecross):
+	rldicr	r11,r4,0,59	/* Align the address to 8 bytes boundary.  */
+	li	r6,-1		/* MASK = 0xffffffffffffffffUL.  */
+	sldi	r9,r9,3		/* Calculate padding.  */
+	ld	r7,0(r11)	/* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r9,r6,r9	/* MASK = MASK << padding.  */
+#else
+	srd	r9,r6,r9	/* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r7,r9	/* Mask bits that are not part of the
+				   string.  */
+	li	r7,0
+	cmpb	r9,r9,r7	/* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	subf	r8,r8,r5	/* Adjust total length.  */
+	cmpldi	cr7,r8,8	/* Check if length was reached.  */
+	ble	cr7,L(short_path_prepare_2)
+
+	/* For next checks we have aligned address, so we check for more
+	   three doublewords to make sure we can read 16 unaligned bytes
+	   to start the bulk copy with 16 aligned addresses.  */
+	ld	r7,8(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r7,r8,-8
+	cmpldi	cr7,r7,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	r7,16(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r8,r8,-16
+	cmpldi	cr7,r8,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	r8,24(r11)
+	cmpb	r9,r8,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+
+	/* No null byte found in the 32 bytes readed and length not reached,
+	   read source again using unaligned loads and store them.  */
+	ld	r9,0(r4)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	std	r9,0(r3)
+	ld	r9,8(r4)
+	std	r9,8(r3)
+
+	/* Align source to 16 bytes and adjust destiny and size.  */
+L(align_to_16b):
+	rldicl	r9,r10,0,60
+	rldicr	r28,r10,0,59
+	add	r12,r5,r9
+	subf	r29,r9,r29
+
+	/* The bulk read/compare/copy loads two doublewords, compare and merge
+	   in a single register for speed.  This is an attempt to speed up the
+	   null-checking process for bigger strings.  */
+
+	cmpldi	cr7,r12,15
+	ble	cr7,L(short_path_prepare_1_2)
+
+	/* Main loop for large sizes, unrolled 2 times to get better use of
+	   pipeline.  */
+	ld	r8,0(28)
+	ld	r10,8(28)
+	li	r9,0
+	cmpb	r7,r8,r9
+	cmpb	r9,r10,r9
+	or.	r6,r9,r7
+	bne	cr0,L(short_path_prepare_2_3)
+	addi	r5,r12,-16
+	addi	r4,r28,16
+	std	r8,0(r29)
+	std	r10,8(r29)
+	cmpldi	cr7,r5,15
+	addi	r9,r29,16
+	ble	cr7,L(short_path_1)
+	mr	r11,r28
+	mr	r6,r29
+	li	r30,0
+	subfic	r26,r4,48
+	subfic	r27,r9,48
+
+	b	L(loop_16b)
+
+	.align	4
+L(loop_start):
+	ld	r31,0(r11)
+	ld	r10,8(r11)
+	cmpb	r0,r31,r7
+	cmpb	r8,r10,r7
+	or.	r7,r0,r8
+	addi	r5,r5,-32
+	cmpldi	cr7,r5,15
+	add	r4,r4,r26
+	add	r9,r9,r27
+	bne	cr0,L(short_path_prepare_2_2)
+	add	r4,r28,r4
+	std	r31,0(r6)
+	add	r9,r29,r9
+	std	r10,8(r6)
+	ble	cr7,L(short_path_1)
+
+L(loop_16b):
+	ld	r10,16(r11)
+	ld	r0,24(r11)
+	cmpb	r8,r10,r30
+	cmpb	r7,r0,r30
+	or.	r7,r8,r7
+	addi	r12,r12,-32
+	cmpldi	cr7,r12,15
+	addi	r11,r11,32
+	bne	cr0,L(short_path_2)
+	std	r10,16(r6)
+	addi	r6,r6,32
+	std	r0,-8(r6)
+	bgt	cr7,L(loop_start)
+
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_1)
+
+	.align	4
+L(short_path_prepare_1_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_1)
+L(short_path_prepare_1_2):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_1)
+L(short_path_prepare_2):
+	mr	r9,r3
+	b	L(short_path_2)
+L(short_path_prepare_2_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_2)
+L(short_path_prepare_2_2):
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_2)
+L(short_path_prepare_2_3):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_2)
+L(zero_pad_start_prepare_1):
+	mr	r5,r6
+	mr	r9,r8
+	b	L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S
new file mode 100644
index 0000000000..3eadbfb09e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S
@@ -0,0 +1,433 @@
+/* Optimized strnlen implementation for POWER8 using a vmx loop.
+
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* It is implemented the following heuristic:
+	1. Case maxlen <= 32: align the pointer to 8 bytes to loop through
+	reading doublewords. Uses the POWER7 algorithm.
+	2. Case maxlen > 32: check for null bytes in the first 16 bytes using
+	unaligned accesses. Return length if found. Otherwise:
+		2.1 Case maxlen < 64: deduct the bytes previously read, align
+		the pointer to 16 bytes and loop through reading quadwords
+		until find null bytes or reach maxlen.
+		2.2 Case maxlen > 64: deduct the bytes previously read, align
+		the pointer to 64 bytes and set up a counter to loop through
+		reading in strides of 64 bytes. In case it finished the loop
+		with null bytes not found, process the remainder bytes by
+		switching to the loop to heuristic in 2.1.  */
+
+#include <sysdep.h>
+
+/* Define default page size to 4KB.  */
+#define PAGE_SIZE 4096
+
+/* The following macros implement Power ISA v2.07 opcodes
+   that could not be used directly into this code to the keep
+   compatibility with older binutils versions.  */
+
+/* Move from vector register doubleword.  */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+/* Move to vector register doubleword.  */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+/* Vector Bit Permute Quadword.  */
+#define VBPERMQ(t,a,b)	.long (0x1000054c	\
+			       | ((t)<<(32-11))	\
+			       | ((a)<<(32-16))	\
+			       | ((b)<<(32-21)) )
+
+/* Vector Population Count Halfword.  */
+#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21)))
+
+/* Vector Count Leading Zeros Halfword.  */
+#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21)))
+
+
+/* int [r3] strnlen (char *s [r3], size_t maxlen [r4])  */
+/* TODO: change to power8 when minimum required binutils allows it.  */
+	.machine  power7
+ENTRY (__strnlen)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+
+	cmpldi	r4,32           /* Check if maxlen <= 32.  */
+	ble	L(small_range)  /* If maxlen <= 32.  */
+
+	/* Upcoming 16 bytes unaligned accesses cannot cross the page boundary
+	   otherwise the processor throws an memory access error.
+	   Use following code to check there is room for such as accesses:
+	     (((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16)
+	   If it is disallowed then switch to the code that handles
+	   the string when maxlen <= 32.  */
+	clrldi	r10,r3,52
+	cmpldi  cr7,r10,PAGE_SIZE-16
+	bgt     cr7,L(small_range)	/* If less than 16B of page end.  */
+
+	/* Compute our permute constant r8.  */
+	li	r7,0
+	/* Compute a bpermd constant to move bit 0 of each word into
+	   a halfword value, and count trailing zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	li	r8,0x2820
+	oris	r8,r8,0x3830
+	sldi	r8,r8,32
+	ori	r8,r8,0x0800
+	oris	r8,r8,0x1810
+#else
+	li	r8,0x1018
+	oris	r8,r8,0x0008
+	sldi	r8,r8,32
+	ori	r8,r8,0x3038
+	oris	r8,r8,0x2028
+#endif
+
+	/* maxlen > 32. Optimistically check for null bytes in the first
+	   16 bytes of the string using unaligned accesses.  */
+	ld	r5,0(r3)
+	ld	r6,8(r3)
+	cmpb	r10,r7,r5		/* Check for null bytes in DWORD1.  */
+	cmpb	r11,r7,r6		/* Check for null bytes in DWORD2.  */
+	or.	r7,r10,r11
+	bne	cr0, L(early_find)	/* If found null bytes.  */
+
+	/* At this point maxlen > 32 and null bytes were not found at first
+	   16 bytes. Prepare for loop using VMX.  */
+
+	/* r3 == s, r4 == maxlen. All other volatile regs are unused now.  */
+
+	addi	r5,r3,16	/* Align up, or just add the 16B we
+				   already checked.  */
+	li	r0,15
+	and	r7,r5,r0	/* Find offset into 16B alignment.  */
+	andc	r5,r5,r0	/* Quadword align up s to the next quadword.  */
+	li	r0,16
+	subf	r0,r7,r0
+	subf	r4,r0,r4	/* Deduct unaligned bytes from maxlen.  */
+
+
+	/* Compute offsets for vmx loads, and precompute the vbpermq
+	   constants for both the 64B and 16B loops.  */
+	li	r6,0
+	vspltisb  v0,0
+	vspltisb  v10,3
+	lvsl	  v11,r6,r6
+	vslb	  v10,v11,v10
+
+	cmpldi  r4,64		/* Check maxlen < 64.  */
+	blt	L(smaller)	/* If maxlen < 64 */
+
+	/* In order to begin the 64B loop, it needs to be 64
+	   bytes aligned. So read quadwords until it is aligned or found null
+	   bytes. At worst case it will be aligned after the fourth iteration,
+	   so unroll the loop to avoid counter checking.  */
+	andi.   r7,r5,63		/* Check if is 64 bytes aligned.  */
+	beq     cr0,L(preloop_64B)	/* If it is already 64B aligned.  */
+	lvx     v1,r5,r6
+	vcmpequb.       v1,v1,v0
+	addi    r5,r5,16
+	addi    r4,r4,-16		/* Decrement maxlen in 16 bytes. */
+	bne     cr6,L(found_aligning64B) /* If found null bytes.  */
+
+	/* Unroll 3x above code block until aligned or find null bytes.  */
+	andi.   r7,r5,63
+	beq     cr0,L(preloop_64B)
+	lvx     v1,r5,r6
+	vcmpequb.      v1,v1,v0
+	addi    r5,r5,16
+	addi    r4,r4,-16
+	bne     cr6,L(found_aligning64B)
+
+	andi.   r7,r5,63
+	beq     cr0,L(preloop_64B)
+	lvx     v1,r5,r6
+	vcmpequb.      v1,v1,v0
+	addi    r5,r5,16
+	addi    r4,r4,-16
+	bne     cr6,L(found_aligning64B)
+
+	andi.   r7,r5,63
+	beq     cr0,L(preloop_64B)
+	lvx     v1,r5,r6
+	vcmpequb.      v1,v1,v0
+	addi    r5,r5,16
+	addi    r4,r4,-16
+	bne     cr6,L(found_aligning64B)
+
+	/* At this point it should be 16 bytes aligned.
+	   Prepare for the 64B loop.  */
+	.p2align 4
+L(preloop_64B):
+	/* Check if maxlen became is less than 64, therefore disallowing the
+	   64B loop. If it happened switch to the 16B loop code.  */
+	cmpldi  r4,64		/* Check if maxlen < 64.  */
+	blt     L(smaller)	/* If maxlen < 64.  */
+	/* Set some constant values.  */
+	li      r7,16
+	li      r10,32
+	li      r9,48
+
+	/* Compute the number of 64 bytes iterations needed.  */
+	srdi	r11,r4,6	/* Compute loop count (maxlen / 64).  */
+	andi.	r4,r4,63	/* Set maxlen the remainder (maxlen % 64).  */
+	mtctr	r11		/* Move loop count to counter register.  */
+
+	/* Handle maxlen > 64. Loop over the bytes in strides of 64B.  */
+	.p2align 4
+L(loop_64B):
+	lvx	v1,r5,r6	/* r5 is the pointer to s.  */
+	lvx	v2,r5,r7
+	lvx	v3,r5,r10
+	lvx	v4,r5,r9
+	/* Compare the four 16B vectors to obtain the least 16 values.
+	   Null bytes should emerge into v7, then check for null bytes.  */
+	vminub	v5,v1,v2
+	vminub	v6,v3,v4
+	vminub	v7,v5,v6
+	vcmpequb. v7,v7,v0		/* Check for null bytes.  */
+	addi	r5,r5,64		/* Add pointer to next iteraction.  */
+	bne	cr6,L(found_64B)	/* If found null bytes.  */
+	bdnz	L(loop_64B)		/* Continue the loop if count > 0. */
+
+/* Hit loop end without null match. So branch to handle the remainder.  */
+
+	/* Prepare a 16B loop to handle two cases:
+		1. If 32 > maxlen < 64.
+		2. If maxlen >= 64, and reached end of the 64B loop with null
+		bytes not found. Thus handle the remainder bytes here. */
+	.p2align 4
+L(smaller):
+        cmpldi  r4,0            /* Check maxlen is zero.  */
+        beq     L(done)         /* If maxlen is zero.  */
+
+	/* Place rounded up number of qw's to check into a vmx
+	   register, and use some vector tricks to minimize
+	   branching.  */
+        MTVRD(v7,r4)            /* Copy maxlen from GPR to vector register. */
+        vspltisb v5,1
+        vspltisb v6,15
+        vspltb   v2,v7,7
+        vaddubs  v3,v5,v6
+
+#ifdef __LITTLE_ENDIAN__
+	vspltish v5,1           /* Compute 16 in each byte.  */
+#endif
+
+	/* Loop in 16B aligned incremements now. */
+	.p2align 4
+L(loop_16B):
+	lvx     v1,r5,r6        /* Load quadword into vector register.  */
+	addi    r5,r5,16        /* Increment address to next 16B block.  */
+	vor     v7,v2,v2        /* Save loop count (v2) into v7. */
+	vsububs v2,v2,v3        /* Subtract 16B from count, saturate at 0. */
+	vminub  v4,v1,v2
+	vcmpequb. v4,v4,v0      /* Checking for null bytes.  */
+	beq     cr6,L(loop_16B) /* If null bytes not found.  */
+
+	vcmpequb  v1,v1,v0
+	VBPERMQ(v1,v1,v10)
+#ifdef __LITTLE_ENDIAN__
+	vsubuhm  v2,v1,v5       /* Form a mask of trailing zeros.  */
+	vandc    v2,v2,v1
+	VPOPCNTH(v1,v2)         /* Count of trailing zeros, 16 if none.  */
+#else
+	VCLZH(v1,v1)            /* Count the leading zeros, 16 if none.  */
+#endif
+	/* Truncate to maximum allowable offset.  */
+	vcmpgtub v2,v1,v7       /* Compare and truncate for matches beyond
+				   maxlen.  */
+	vsel     v1,v1,v7,v2    /* 0-16 is now in byte 7.  */
+
+	MFVRD(r0,v1)
+	addi    r5,r5,-16       /* Undo speculative bump.  */
+	extsb   r0,r0           /* Clear whatever gunk is in the high 56b.  */
+	add     r5,r5,r0        /* Add the offset of whatever was found.  */
+L(done):
+	subf    r3,r3,r5        /* Length is equal to the offset of null byte
+				   matched minus the pointer to s.  */
+	blr                     /* Done.  */
+
+	/* Handle case of maxlen > 64 and found null bytes in last block
+	   of 64 bytes read.  */
+	.p2align 4
+L(found_64B):
+	/* A zero was found. Reduce the result.  */
+	vcmpequb  v1,v1,v0
+	vcmpequb  v2,v2,v0
+	vcmpequb  v3,v3,v0
+	vcmpequb  v4,v4,v0
+
+	/* Permute the first bit of each byte into bits 48-63.  */
+	VBPERMQ(v1,v1,v10)
+	VBPERMQ(v2,v2,v10)
+	VBPERMQ(v3,v3,v10)
+	VBPERMQ(v4,v4,v10)
+
+	/* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi	v2,v2,v2,2
+	vsldoi	v3,v3,v3,4
+	vsldoi	v4,v4,v4,6
+#else
+	vsldoi	v1,v1,v1,6
+	vsldoi	v2,v2,v2,4
+	vsldoi	v3,v3,v3,2
+#endif
+
+	/* Merge the results and move to a GPR.  */
+	vor	v1,v2,v1
+	vor	v2,v3,v4
+	vor	v4,v1,v2
+
+	/* Adjust address to the start of the current 64B block.  */
+	addi	r5,r5,-64
+
+	MFVRD(r10,v4)
+#ifdef __LITTLE_ENDIAN__
+	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
+	andc	r9,r9,r10
+	popcntd	r0,r9		/* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10		/* Count leading zeros before the match.  */
+#endif
+	subf	r5,r3,r5
+	add	r3,r5,r0	/* Compute final length.  */
+	blr                     /* Done.  */
+
+	/* Handle case where null bytes were found while aligning
+	   as a preparation for the 64B loop.  */
+	.p2align 4
+L(found_aligning64B):
+	VBPERMQ(v1,v1,v10)
+#ifdef __LITTLE_ENDIAN__
+	MFVRD(r10,v1)
+	addi    r9,r10,-1       /* Form a mask from trailing zeros.  */
+	andc    r9,r9,r10
+	popcntd r0,r9           /* Count the bits in the mask.  */
+#else
+	vsldoi  v1,v1,v1,6
+	MFVRD(r10,v1)
+	cntlzd  r0,r10          /* Count leading zeros before the match.  */
+#endif
+	addi    r5,r5,-16	/* Adjust address to offset of last 16 bytes
+				   read.  */
+	/* Calculate length as subtracted the pointer to s of last 16 bytes
+	   offset, added with the bytes before the match.  */
+	subf    r5,r3,r5
+	add     r3,r5,r0
+	blr			/* Done.  */
+
+	/* Handle case of maxlen > 32 and found a null bytes within the first
+	   16 bytes of s.  */
+	.p2align 4
+L(early_find):
+	bpermd	r5,r8,r10        /* r8 contains the bit permute constants.  */
+	bpermd	r6,r8,r11
+	sldi	r5,r5,8
+	or	r5,r5,r6	/* r5 should hold a 16B mask of
+				   a potential 0.  */
+	cntlzd	r5,r5		/* Count leading zeros.  */
+	addi	r3,r5,-48	/* Deduct the 48 leading zeros always
+				   present.  */
+	blr			/* Done.  */
+
+	/* Handle case of maxlen <= 32. Use the POWER7 algorithm.  */
+	.p2align 4
+L(small_range):
+	clrrdi	r8,r3,3  	/* Align the pointer to 8B.  */
+	li	r0,0
+	/* Register's content at this point:
+	   r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B,
+	   r7 == last acceptable address. */
+	cmpldi	r4,0                 /* Check if maxlen is zero.  */
+	beq	L(end_max)	     /* If maxlen is zero.  */
+
+	/* Calculate the last acceptable address and check for possible
+	   addition overflow by using satured math:
+	   r7 = r3 + r4
+	   r7 |= -(r7 < x)  */
+	add     r7,r3,r4
+	subfc   r6,r3,r7
+	subfe   r9,r9,r9
+	extsw   r6,r9
+	or      r7,r7,r6
+	addi    r7,r7,-1
+
+	clrrdi	r7,r7,3              /* Align to 8B address of last
+					acceptable address.  */
+
+	rlwinm	r6,r3,3,26,28        /* Calculate padding.  */
+	ld	r12,0(r8)            /* Load aligned doubleword.  */
+	cmpb	r10,r12,r0           /* Check for null bytes. */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	sld	r10,r10,r6
+#else
+	sld	r10,r10,r6
+	srd	r10,r10,r6
+#endif /* __LITTLE_ENDIAN__  */
+	cmpldi	cr7,r10,0
+	bne	cr7,L(done_small)    /* If found null byte.  */
+
+	cmpld	r8,r7                /* Check if reached maxlen.  */
+	beq	L(end_max)	     /* If reached maxlen.  */
+
+	/* Still handling case of maxlen <= 32. Read doubleword aligned until
+	   find null bytes or reach maxlen.  */
+	.p2align 4
+L(loop_small):
+	ldu	r12,8(r8)         /* Load next doubleword and update r8.  */
+	cmpb	r10,r12,r0        /* Check for null bytes.  */
+	cmpldi	cr6,r10,0
+	bne	cr6,L(done_small) /* If found null bytes.  */
+	cmpld	r8,r7             /* Check if reached maxlen. */
+	bne	L(loop_small)	  /* If it has more bytes to read.  */
+	mr	r3,r4             /* Reached maxlen with null bytes not found.
+				     Length is equal to maxlen.  */
+	blr			  /* Done.  */
+
+	/* Still handling case of maxlen <= 32. Found null bytes.
+	   Registers: r10 == match bits within doubleword, r8 == address of
+	   last doubleword read, r3 == pointer to s, r4 == maxlen.  */
+	.p2align 4
+L(done_small):
+#ifdef __LITTLE_ENDIAN__
+	/* Count trailing zeros.  */
+	addi	r0,r10,-1
+	andc	r0,r0,r10
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	sub	r3,r8,r3      /* Calculate total of bytes before the match.  */
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r3,r0      /* Length until the match.  */
+	cmpld	r3,r4         /* Check length is greater than maxlen.  */
+	blelr
+	mr	r3,r4	      /* If length is greater than maxlen, return
+				 maxlen.  */
+	blr
+
+	/* Handle case of reached maxlen with null bytes not found.  */
+	.p2align 4
+L(end_max):
+	mr	r3,r4	/* Length is equal to maxlen.  */
+	blr		/* Done.  */
+
+
+END (__strnlen)
+libc_hidden_def (__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S
new file mode 100644
index 0000000000..8eb74853c3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S
@@ -0,0 +1,464 @@
+/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* char *[r3] strrchr (char *s [r3], int c [r4])  */
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b)  .long (0x1000054c \
+				| ((t)<<(32-11)) \
+				| ((a)<<(32-16)) \
+				| ((b)<<(32-21)) )
+#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VADDUQM(t,a,b)  .long (0x10000100 \
+				| ((t)<<(32-11)) \
+				| ((a)<<(32-16)) \
+				| ((b)<<(32-21)) )
+#ifdef __LITTLE_ENDIAN__
+/* Find the match position from v6 and place result in r6.  */
+# define CALCULATE_MATCH() \
+	VBPERMQ(v6, v6, v10); \
+	vsldoi	v6, v6, v6, 6; \
+	MFVRD(r7, v6); \
+	cntlzd	r6, r7; \
+	subfic	r6, r6, 15;
+/*
+ * Find the first null position to mask bytes after null.
+ * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
+ * Result placed at v2.
+ */
+# define FIND_NULL_POS(reg) \
+	vspltisb	v11, -1; \
+	VADDUQM(v11, reg, v11); \
+	vandc	v11, v11, reg; \
+	VPOPCNTD(v2, v11); \
+	vspltb	v11, v2, 15; \
+	vcmpequb.	v11, v11, v9; \
+	blt	cr6, 1f; \
+	vsldoi	v9, v0, v9, 1; \
+	vslo	v2, v2, v9; \
+1: \
+	vsumsws	v2, v2, v0;
+#else
+# define CALCULATE_MATCH() \
+	VBPERMQ(v6, v6, v10); \
+	MFVRD(r7, v6); \
+	addi	r6, r7, -1; \
+	andc	r6, r6, r7; \
+	popcntd	r6, r6; \
+	subfic	r6, r6, 15;
+# define FIND_NULL_POS(reg) \
+	VCLZD(v2, reg); \
+	vspltb	v11, v2, 7; \
+	vcmpequb.	v11, v11, v9; \
+	blt	cr6, 1f; \
+	vsldoi	v9, v0, v9, 1; \
+	vsro	v2, v2, v9; \
+1: \
+	vsumsws	v2, v2, v0;
+#endif	/* !__LITTLE_ENDIAN__  */
+	.machine  power7
+ENTRY (strrchr)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
+	cmpdi	cr7,r4,0
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	li	r9,0	      /* Used to store last occurence.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+
+	beq	cr7,L(null_match)
+
+	/* Replicate byte to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi	r4,r4,32,0
+
+	/* r4 is changed now.  If it's passed more chars, then
+	   check for null again.  */
+	cmpdi	cr7,r4,0
+	beq	cr7,L(null_match)
+	/* Now r4 has a doubleword of c bytes and r0 has
+	   a doubleword of null bytes.  */
+
+	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
+	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+#else
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+#endif
+	or	r5,r10,r11    /* OR the results to speed things up.  */
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+L(align):
+	andi.	r12, r8, 15
+
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bne	cr0, L(loop)
+
+	/* Handle WORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r7,16(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	cmpb	r6,r7,r4
+	cmpb	r7,r7,r0
+	or	r12,r10,r11
+	or	r5,r6,r7
+	or	r5,r12,r5
+	cmpdi	cr7,r5,0
+	beq	cr7,L(vector)
+
+	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c/null byte.  */
+	cmpdi	cr6,r12,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second doubleword.  Adjust the
+	   address again and move the result of cmpb to r10 so we can calculate
+	   the pointer.  */
+
+	mr	r10,r6
+	mr	r11,r7
+	addi	r8,r8,8
+
+	/* r10/r11 have the output of the cmpb instructions, that is,
+	   0xff in the same position as the c/null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+
+L(done):
+	/* If there are more than one 0xff in r11, find the first position of
+	   0xff in r11 and fill r10 with 0 from that position.  */
+	cmpdi	cr7,r11,0
+	beq	cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+	addi	r3,r11,-1
+	andc	r3,r3,r11
+	popcntd r0,r3
+#else
+	cntlzd	r0,r11
+#endif
+	subfic	r0,r0,63
+	li	r6,-1
+#ifdef __LITTLE_ENDIAN__
+	srd	r0,r6,r0
+#else
+	sld	r0,r6,r0
+#endif
+	and	r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+	cntlzd	r0,r10		/* Count leading zeros before c matches.  */
+	addi	r3,r10,-1
+	andc	r3,r3,r10
+	addi	r10,r11,-1
+	andc	r10,r10,r11
+	cmpld	cr7,r3,r10
+	bgt	cr7,L(no_match)
+#else
+	addi	r3,r10,-1	/* Count trailing zeros before c matches.  */
+	andc	r3,r3,r10
+	popcntd	r0,r3
+	cmpld	cr7,r11,r10
+	bgt	cr7,L(no_match)
+#endif
+	srdi	r0,r0,3		/* Convert trailing zeros to bytes.  */
+	subfic	r0,r0,7
+	add	r9,r8,r0      /* Return address of the matching c byte
+				 or null in case c was not found.  */
+	li	r0,0
+	cmpdi	cr7,r11,0     /* If r11 == 0, no null's have been found.  */
+	beq	cr7,L(align)
+
+	.align	4
+L(no_match):
+	mr	r3,r9
+	blr
+
+/* Check the first 32B in GPR's and move to vectorized loop.  */
+	.p2align  5
+L(vector):
+	addi	r3, r8, 8
+	/* Make sure 32B aligned.  */
+	andi.	r10, r3, 31
+	bne	cr0, L(loop)
+	vspltisb	v0, 0
+	/* Precompute vbpermq constant.  */
+	vspltisb	v10, 3
+	lvsl	v11, r0, r0
+	vslb	v10, v11, v10
+	MTVRD(v1, r4)
+	li	r5, 16
+	vspltb	v1, v1, 7
+	/* Compare 32 bytes in each loop.  */
+L(continue):
+	lvx	v4, 0, r3
+	lvx	v5, r3, r5
+	vcmpequb	v2, v0, v4
+	vcmpequb	v3, v0, v5
+	vcmpequb	v6, v1, v4
+	vcmpequb	v7, v1, v5
+	vor	v8, v2, v3
+	vor	v9, v6, v7
+	vor	v11, v8, v9
+	vcmpequb.	v11, v0, v11
+	addi	r3, r3, 32
+	blt	cr6, L(continue)
+	vcmpequb.	v8, v0, v8
+	blt	cr6, L(match)
+
+	/* One (or both) of the quadwords contains c/null.  */
+	vspltisb	v8, 2
+	vspltisb	v9, 5
+	/* Precompute values used for comparison.  */
+	vsl	v9, v8, v9	/* v9 = 0x4040404040404040.  */
+	vaddubm	v8, v9, v9
+	vsldoi	v8, v0, v8, 1	/* v8 = 0x80.  */
+
+	/* Check if null is in second qw.  */
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(secondqw)
+
+	/* Null found in first qw.  */
+	addi	r8, r3, -32
+	/* Calculate the null position.  */
+	FIND_NULL_POS(v2)
+	/* Check if null is in the first byte.  */
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(no_match)
+	vsububm	v2, v8, v2
+	/* Mask unwanted bytes after null.  */
+#ifdef __LITTLE_ENDIAN__
+	vslo	v6, v6, v2
+	vsro	v6, v6, v2
+#else
+	vsro	v6, v6, v2
+	vslo	v6, v6, v2
+#endif
+	vcmpequb.	v11, v0, v6
+	blt	cr6, L(no_match)
+	/* Found a match before null.  */
+	CALCULATE_MATCH()
+	add	r3, r8, r6
+	blr
+
+L(secondqw):
+	addi	r8, r3, -16
+	FIND_NULL_POS(v3)
+	vcmpequb.	v11, v0, v2
+	blt	cr6, L(no_match1)
+	vsububm	v2, v8, v2
+	/* Mask unwanted bytes after null.  */
+#ifdef __LITTLE_ENDIAN__
+	vslo	v7, v7, v2
+	vsro	v7, v7, v2
+#else
+	vsro	v7, v7, v2
+	vslo	v7, v7, v2
+#endif
+	vcmpequb.	v11, v0, v7
+	blt	cr6, L(no_match1)
+	addi	r8, r8, 16
+	vor	v6, v0, v7
+L(no_match1):
+	addi	r8, r8, -16
+	vcmpequb.	v11, v0, v6
+	blt	cr6, L(no_match)
+	/* Found a match before null.  */
+	CALCULATE_MATCH()
+	add	r3, r8, r6
+	blr
+
+L(match):
+	/* One (or both) of the quadwords contains a match.  */
+	mr	r8, r3
+	vcmpequb.	v8, v0, v7
+	blt	cr6, L(firstqw)
+	/* Match found in second qw.  */
+	addi	r8, r8, 16
+	vor	v6, v0, v7
+L(firstqw):
+	addi	r8, r8, -32
+	CALCULATE_MATCH()
+	add	r9, r8, r6      /* Compute final length.  */
+	b	L(continue)
+/* We are here because strrchr was called with a null byte.  */
+	.align	4
+L(null_match):
+	/* r0 has a doubleword of null bytes.  */
+
+	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
+	sld	r5,r5,r6
+	srd	r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done_null)
+
+	andi.	r12, r8, 15
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bne	cr0, L(loop_null)
+
+	/* Handle WORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r5,r12,r0
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done_null)
+	b	L(loop_null)  /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32-bytes.  */
+	.p2align  5
+L(loop_null):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r11,16(r8)
+	cmpb	r5,r12,r0
+	cmpb	r10,r11,r0
+	or	r6,r5,r10
+	cmpdi	cr7,r6,0
+	beq	cr7,L(vector1)
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done_null)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   pointer.  */
+
+	mr	r5,r10
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r5,-1
+	andc	r0,r0,r5
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert trailing zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching null byte.  */
+	blr
+/* Check the first 32B in GPR's and move to vectorized loop.  */
+	.p2align  5
+L(vector1):
+	addi	r3, r8, 8
+	/* Make sure 32B aligned.  */
+	andi.	r10, r3, 31
+	bne	cr0, L(loop_null)
+	vspltisb	v0, 0
+	/* Precompute vbpermq constant.  */
+	vspltisb	v10, 3
+	lvsl	v11, r0, r0
+	vslb	v10, v11, v10
+	li	r5, 16
+	/* Compare 32 bytes in each loop.  */
+L(continue1):
+	lvx	v4, 0, r3
+	lvx	v5, r3, r5
+	vcmpequb	v2, v0, v4
+	vcmpequb	v3, v0, v5
+	vor	v8, v2, v3
+	vcmpequb.	v11, v0, v8
+	addi	r3, r3, 32
+	blt	cr6, L(continue1)
+	addi	r3, r3, -32
+	VBPERMQ(v2, v2, v10)
+	VBPERMQ(v3, v3, v10)
+	/* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi	v3, v3, v3, 2
+#else
+	vsldoi	v2, v2, v2, 6
+	vsldoi	v3, v3, v3, 4
+#endif
+	/* Merge the results and move to a GPR.  */
+	vor	v4, v3, v2
+	MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+	addi	r6, r5, -1
+	andc	r6, r6, r5
+	popcntd	r6, r6
+#else
+	cntlzd	r6, r5  /* Count leading zeros before the match.  */
+#endif
+	add	r3, r3, r6      /* Compute final length.  */
+	blr
+END (strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S
new file mode 100644
index 0000000000..e9271898f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S
@@ -0,0 +1,202 @@
+/* Optimized strspn implementation for Power8.
+
+   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* size_t [r3] strspn (const char *string [r3],
+                       const char *needleAccept [r4])  */
+
+/* This takes a novel approach by computing a 256 bit mask whereby
+   each set bit implies the byte is "accepted".  P8 vector hardware
+   has extremely efficient hardware for selecting bits from a mask.
+
+   One might ask "why not use bpermd for short strings"?  It is
+   so slow that its performance about matches the generic PPC64
+   variant without any fancy masking, with the added expense of
+   making the mask.  That was the first variant of this.  */
+
+
+
+#include "sysdep.h"
+
+#ifndef USE_AS_STRCSPN
+#  define USE_AS_STRCSPN 0
+#  ifndef STRSPN
+#    define STRSPN strspn
+#  endif
+#  define INITIAL_MASK 0
+#  define UPDATE_MASK(RA, RS, RB) or	RA, RS, RB
+#else
+#  ifndef STRSPN
+#    define STRSPN strcspn
+#  endif
+#  define INITIAL_MASK -1
+#  define UPDATE_MASK(RA, RS, RB) andc	RA, RS, RB
+#endif
+
+/* Simple macro to use VSX instructions in overlapping VR's.  */
+#define XXVR(insn, vrt, vra, vrb) \
+	insn 32+vrt, 32+vra, 32+vrb
+
+/* ISA 2.07B instructions are not all defined for older binutils.
+   Macros are defined below for these newer instructions in order
+   to maintain compatibility.  */
+
+/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs.  */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+			      | ((t)<<(32-11))	\
+			      | ((a)<<(32-16))	\
+			      | ((b)<<(32-21)) )
+
+	/* This can be updated to power8 once the minimum version of
+	   binutils supports power8 and the above instructions.  */
+	.machine power7
+EALIGN(STRSPN, 4, 0)
+	CALL_MCOUNT 2
+
+	/* Generate useful constants for later on.  */
+	vspltisb v1, 7
+	vspltisb v2, -1
+	vslb	v1, v1, v1	/* 0x80 to swap high bit for vbpermq.  */
+	vspltisb v10, 0
+	vsldoi	v4, v10, v2, 2	/* 0xFFFF into vr4.  */
+	XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches.  */
+
+	/* Prepare to compute 256b mask.  */
+	addi	r4, r4, -1
+	li	r5, INITIAL_MASK
+	li	r6, INITIAL_MASK
+	li	r7, INITIAL_MASK
+	li	r8, INITIAL_MASK
+
+#if USE_AS_STRCSPN
+	/* Ensure the null character never matches by clearing ISA bit 0 in
+	   in r5 which is the bit which will check for it in the later usage
+	   of vbpermq.  */
+	srdi	r5, r5, 1
+#endif
+
+	li	r11, 1
+	sldi	r11, r11, 63
+
+	/* Start interleaved Mask computation.
+	   This will eventually or 1's into ignored bits from vbpermq.  */
+	lvsr	v11, 0, r3
+	vspltb  v11, v11, 0	/* Splat shift constant.  */
+
+	/* Build a 256b mask in r5-r8.  */
+	.align 4
+L(next_needle):
+	lbzu	r9, 1(r4)
+
+	cmpldi	cr0, r9, 0
+	cmpldi	cr1, r9, 128
+
+	/* This is a little tricky.  srd only uses the first 7 bits,
+	   and if bit 7 is set, value is always 0.  So, we can
+	   effectively shift 128b in this case.  */
+	xori	r12, r9,  0x40	/* Invert bit 6.  */
+	srd	r10, r11, r9	/* Mask for bits 0-63.  */
+	srd	r12, r11, r12	/* Mask for bits 64-127.  */
+
+	beq	cr0, L(start_cmp)
+
+	/* Now, or the value into the correct GPR.  */
+	bge cr1,L(needle_gt128)
+	UPDATE_MASK (r5, r5, r10)	/* 0 - 63.  */
+	UPDATE_MASK (r6, r6, r12)	/* 64 - 127.  */
+	b L(next_needle)
+
+	.align 4
+L(needle_gt128):
+	UPDATE_MASK (r7, r7, r10)	/* 128 - 191.  */
+	UPDATE_MASK (r8, r8, r12)	/* 192 - 255.  */
+	b L(next_needle)
+
+
+	.align 4
+L(start_cmp):
+	/* Move and merge bitmap into 2 VRs.  bpermd is slower on P8.  */
+	mr	r0, r3		/* Save r3 for final length computation.  */
+	MTVRD (v5, r5)
+	MTVRD (v6, r6)
+	MTVRD (v7, r7)
+	MTVRD (v8, r8)
+
+	/* Continue interleaved mask generation.  */
+#ifdef __LITTLE_ENDIAN__
+	vsrw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
+	vsplth  v11, v11, 0	/* Only care about the high 16 bits of v10.  */
+#else
+	vslw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
+	vsplth  v11, v11, 1	/* Only care about the low 16 bits of v10.  */
+#endif
+	lvx	v0, 0, r3	/* Note, unaligned load ignores lower bits.  */
+
+	/* Do the merging of the bitmask.  */
+	XXVR(xxmrghd, v5, v5, v6)
+	XXVR(xxmrghd, v6, v7, v8)
+
+	/* Finish mask generation.  */
+	vand	v11, v11, v4	/* Throwaway bits not in the mask.  */
+
+	/* Compare the first 1-16B, while masking unwanted bytes.  */
+	clrrdi  r3, r3, 4	/* Note,  counts from qw boundaries.  */
+	vxor	v9, v0, v1	/* Swap high bit.  */
+	VBPERMQ (v8, v5, v0)
+	VBPERMQ (v7, v6, v9)
+	vor	v7, v7, v8
+	vor	v7, v7, v11	/* Ignore non-participating bytes.  */
+	vcmpequh. v8, v7, v4
+	bnl	cr6, L(done)
+
+	addi	r3, r3, 16
+
+	.align 4
+L(vec):
+	lvx	v0, 0, r3
+	addi	r3, r3, 16
+	vxor	v9, v0, v1	/* Swap high bit.  */
+	VBPERMQ (v8, v5, v0)
+	VBPERMQ (v7, v6, v9)
+	vor	v7, v7, v8
+	vcmpequh. v8, v7, v4
+	blt	cr6, L(vec)
+
+	addi	r3, r3, -16
+L(done):
+	subf	r3, r0, r3
+	MFVRD (r10, v7)
+
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,  r10, 1	/* Count the trailing 1's.  */
+	andc	r10, r10, r0
+	popcntd	r10, r10
+#else
+	xori	r10, r10, 0xffff /* Count leading 1's by inverting.  */
+	addi	r3,  r3,  -48	/* Account for the extra leading zeros.  */
+	cntlzd  r10, r10
+#endif
+
+	add	r3, r3, r10
+	blr
+
+END(STRSPN)
+libc_hidden_builtin_def (STRSPN)