diff options
Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc64/power8')
35 files changed, 7733 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies new file mode 100644 index 0000000000..9a5e3c7277 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Implies @@ -0,0 +1,2 @@ +powerpc/powerpc64/power7/fpu +powerpc/powerpc64/power7 diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile new file mode 100644 index 0000000000..71a59529f3 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/Makefile @@ -0,0 +1,3 @@ +ifeq ($(subdir),string) +sysdep_routines += strcasestr-ppc64 +endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies new file mode 100644 index 0000000000..1187cdfb0a --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/Implies @@ -0,0 +1 @@ +powerpc/powerpc64/power7/fpu/ diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S new file mode 100644 index 0000000000..4c42926a74 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S @@ -0,0 +1,303 @@ +/* Optimized expf(). PowerPC64/POWER8 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Short algorithm description: + * + * Let K = 64 (table size). + * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) + * where: + * x = m*log(2)/K + y, y in [0.0..log(2)/K] + * m = n*K + j, m,n,j - signed integer, j in [0..K-1] + * values of 2^(j/K) are tabulated as T[j]. + * + * P(y) is a minimax polynomial approximation of expf(y)-1 + * on small interval [0.0..log(2)/K]. + * + * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as + * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y + * + * Special cases: + * expf(NaN) = NaN + * expf(+INF) = +INF + * expf(-INF) = 0 + * expf(x) = 1 for subnormals + * for finite argument, only expf(0)=1 is exact + * expf(x) overflows if x>88.7228317260742190 + * expf(x) underflows if x<-103.972076416015620 + */ + +#define C1 0x42ad496b /* Single precision 125*log(2). */ +#define C2 0x31800000 /* Single precision 2^(-28). */ +#define SP_INF 0x7f800000 /* Single precision Inf. */ +#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias. */ + +#define DATA_OFFSET r9 + +/* Implements the function + + float [fp1] expf (float [fp1] x) */ + + .machine power8 +EALIGN(__ieee754_expf, 4, 0) + addis DATA_OFFSET,r2,.Lanchor@toc@ha + addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l + + xscvdpspn v0,v1 + mfvsrd r8,v0 /* r8 = x */ + lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET) + lfd fp3,(.P2-.Lanchor)(DATA_OFFSET) + rldicl r3,r8,32,33 /* r3 = |x| */ + lis r4,C1@ha /* r4 = 125*log(2) */ + ori r4,r4,C1@l + cmpw r3,r4 + lfd fp5,(.P3-.Lanchor)(DATA_OFFSET) + lfd fp4,(.RS-.Lanchor)(DATA_OFFSET) + fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */ + bge L(special_paths) /* |x| >= 125*log(2) ? */ + + lis r4,C2@ha + ori r4,r4,C2@l + cmpw r3,r4 + blt L(small_args) /* |x| < 2^(-28) ? */ + + /* Main path: here if 2^(-28) <= |x| < 125*log(2) */ + frsp fp6,fp2 + xscvdpsp v2,v2 + mfvsrd r8,v2 + mr r3,r8 /* r3 = m */ + rldicl r8,r8,32,58 /* r8 = j */ + lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) + fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */ + srdi r3,r3,32 + clrrwi r3,r3,6 /* r3 = n */ + lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) + fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */ + fmul fp2,fp0,fp0 /* fp2 = z = y^2 */ + lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) + lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) + lis r4,SP_EXP_BIAS@ha + ori r4,r4,SP_EXP_BIAS@l + add r3,r3,r4 + rldic r3,r3,49,1 /* r3 = 2^n */ + fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */ + fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */ + mtvsrd v1,r3 + xscvspdp v1,v1 + fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */ + fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */ + sldi r8,r8,3 /* Access doublewords from T[j]. */ + addi r6,DATA_OFFSET,(.Ttable-.Lanchor) + lfdx fp3,r6,r8 + fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */ + fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */ + frsp fp1,fp1 + blr + + .align 4 +/* x is either underflow, overflow, infinite or NaN. */ +L(special_paths): + srdi r8,r8,32 + rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive. + r8 = 4, otherwise. */ + addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor) + lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */ + cmpw r3,r4 + /* |x| <= .SPRANGE[signbit(x)] */ + ble L(near_under_or_overflow) + + lis r4,SP_INF@ha + ori r4,r4,SP_INF@l + cmpw r3,r4 + bge L(arg_inf_or_nan) /* |x| > Infinite ? */ + + addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor) + lfsx fp1,r6,r8 + fmuls fp1,fp1,fp1 + blr + + + .align 4 +L(small_args): + /* expf(x) = 1.0, where |x| < |2^(-28)| */ + lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET) + fadds fp1,fp1,fp2 + blr + + + .align 4 +L(arg_inf_or_nan:) + bne L(arg_nan) + + /* expf(+INF) = +INF + expf(-INF) = 0 */ + addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor) + lfsx fp1,r6,r8 + blr + + + .align 4 +L(arg_nan): + /* expf(NaN) = NaN */ + fadd fp1,fp1,fp1 + frsp fp1,fp1 + blr + + .align 4 +L(near_under_or_overflow): + frsp fp6,fp2 + xscvdpsp v2,v2 + mfvsrd r8,v2 + mr r3,r8 /* r3 = m */ + rldicl r8,r8,32,58 /* r8 = j */ + lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) + fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */ + srdi r3,r3,32 + clrrwi r3,r3,6 /* r3 = n */ + lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) + fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */ + fmul fp2,fp0,fp0 /* fp2 = z = y^2 */ + lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) + lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) + ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET) + add r3,r3,r4 + rldic r3,r3,46,1 /* r3 = 2 */ + fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */ + fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */ + mtvsrd v1,r3 + fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */ + fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */ + sldi r8,r8,3 /* Access doublewords from T[j]. */ + addi r6,DATA_OFFSET,(.Ttable-.Lanchor) + lfdx fp3,r6,r8 + fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + T[j]) */ + fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + T[j]) */ + frsp fp1,fp1 + blr +END(__ieee754_expf) + + .section .rodata, "a",@progbits +.Lanchor: + .balign 8 +/* Table T[j] = 2^(j/K). Double precision. */ +.Ttable: + .8byte 0x3ff0000000000000 + .8byte 0x3ff02c9a3e778061 + .8byte 0x3ff059b0d3158574 + .8byte 0x3ff0874518759bc8 + .8byte 0x3ff0b5586cf9890f + .8byte 0x3ff0e3ec32d3d1a2 + .8byte 0x3ff11301d0125b51 + .8byte 0x3ff1429aaea92de0 + .8byte 0x3ff172b83c7d517b + .8byte 0x3ff1a35beb6fcb75 + .8byte 0x3ff1d4873168b9aa + .8byte 0x3ff2063b88628cd6 + .8byte 0x3ff2387a6e756238 + .8byte 0x3ff26b4565e27cdd + .8byte 0x3ff29e9df51fdee1 + .8byte 0x3ff2d285a6e4030b + .8byte 0x3ff306fe0a31b715 + .8byte 0x3ff33c08b26416ff + .8byte 0x3ff371a7373aa9cb + .8byte 0x3ff3a7db34e59ff7 + .8byte 0x3ff3dea64c123422 + .8byte 0x3ff4160a21f72e2a + .8byte 0x3ff44e086061892d + .8byte 0x3ff486a2b5c13cd0 + .8byte 0x3ff4bfdad5362a27 + .8byte 0x3ff4f9b2769d2ca7 + .8byte 0x3ff5342b569d4f82 + .8byte 0x3ff56f4736b527da + .8byte 0x3ff5ab07dd485429 + .8byte 0x3ff5e76f15ad2148 + .8byte 0x3ff6247eb03a5585 + .8byte 0x3ff6623882552225 + .8byte 0x3ff6a09e667f3bcd + .8byte 0x3ff6dfb23c651a2f + .8byte 0x3ff71f75e8ec5f74 + .8byte 0x3ff75feb564267c9 + .8byte 0x3ff7a11473eb0187 + .8byte 0x3ff7e2f336cf4e62 + .8byte 0x3ff82589994cce13 + .8byte 0x3ff868d99b4492ed + .8byte 0x3ff8ace5422aa0db + .8byte 0x3ff8f1ae99157736 + .8byte 0x3ff93737b0cdc5e5 + .8byte 0x3ff97d829fde4e50 + .8byte 0x3ff9c49182a3f090 + .8byte 0x3ffa0c667b5de565 + .8byte 0x3ffa5503b23e255d + .8byte 0x3ffa9e6b5579fdbf + .8byte 0x3ffae89f995ad3ad + .8byte 0x3ffb33a2b84f15fb + .8byte 0x3ffb7f76f2fb5e47 + .8byte 0x3ffbcc1e904bc1d2 + .8byte 0x3ffc199bdd85529c + .8byte 0x3ffc67f12e57d14b + .8byte 0x3ffcb720dcef9069 + .8byte 0x3ffd072d4a07897c + .8byte 0x3ffd5818dcfba487 + .8byte 0x3ffda9e603db3285 + .8byte 0x3ffdfc97337b9b5f + .8byte 0x3ffe502ee78b3ff6 + .8byte 0x3ffea4afa2a490da + .8byte 0x3ffefa1bee615a27 + .8byte 0x3fff50765b6e4540 + .8byte 0x3fffa7c1819e90d8 + +.KLN2: + .8byte 0x40571547652b82fe /* Double precision K/log(2). */ + +/* Double precision polynomial coefficients. */ +.P0: + .8byte 0x3fefffffffffe7c6 +.P1: + .8byte 0x3fe00000008d6118 +.P2: + .8byte 0x3fc55550da752d4f +.P3: + .8byte 0x3fa56420eb78fa85 + +.RS: + .8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */ +.NLN2K: + .8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */ +.DP_EXP_BIAS: + .8byte 0x000000000000ffc0 /* Double precision exponent bias. */ + + .balign 4 +.SPone: + .4byte 0x3f800000 /* Single precision 1.0. */ +.SP_RS: + .4byte 0x4b400000 /* Single precision 2^23 + 2^22. */ + +.SPRANGE: /* Single precision overflow/underflow bounds. */ + .4byte 0x42b17217 /* if x>this bound, then result overflows. */ + .4byte 0x42cff1b4 /* if x<this bound, then result underflows. */ + +.SPLARGE_SMALL: + .4byte 0x71800000 /* 2^100. */ + .4byte 0x0d800000 /* 2^-100. */ + +.INF_ZERO: + .4byte 0x7f800000 /* Single precision Inf. */ + .4byte 0 /* Single precision zero. */ + +strong_alias (__ieee754_expf, __expf_finite) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies new file mode 100644 index 0000000000..7fd86fdf87 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies @@ -0,0 +1 @@ +powerpc/powerpc64/power7/fpu/multiarch diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S new file mode 100644 index 0000000000..8dfa0076e0 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S @@ -0,0 +1,508 @@ +/* Optimized cosf(). PowerPC64/POWER8 version. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define _ERRNO_H 1 +#include <bits/errno.h> + +#define FRAMESIZE (FRAME_MIN_SIZE+16) + +#define FLOAT_EXPONENT_SHIFT 23 +#define FLOAT_EXPONENT_BIAS 127 +#define INTEGER_BITS 3 + +#define PI_4 0x3f490fdb /* PI/4 */ +#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */ +#define TWO_PN5 0x3d000000 /* 2^-5 */ +#define TWO_PN27 0x32000000 /* 2^-27 */ +#define INFINITY 0x7f800000 +#define TWO_P23 0x4b000000 /* 2^23 */ +#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */ + + /* Implements the function + + float [fp1] cosf (float [fp1] x) */ + + .machine power8 +EALIGN(__cosf, 4, 0) + addis r9,r2,L(anchor)@toc@ha + addi r9,r9,L(anchor)@toc@l + + lis r4,PI_4@h + ori r4,r4,PI_4@l + + xscvdpspn v0,v1 + mfvsrd r8,v0 + rldicl r3,r8,32,33 /* Remove sign bit. */ + + cmpw r3,r4 + bge L(greater_or_equal_pio4) + + lis r4,TWO_PN5@h + ori r4,r4,TWO_PN5@l + + cmpw r3,r4 + blt L(less_2pn5) + + /* Chebyshev polynomial of the form: + * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ + + lfd fp9,(L(C0)-L(anchor))(r9) + lfd fp10,(L(C1)-L(anchor))(r9) + lfd fp11,(L(C2)-L(anchor))(r9) + lfd fp12,(L(C3)-L(anchor))(r9) + lfd fp13,(L(C4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + lfd fp3,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ + fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ + fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ + fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ + fmadd fp1,fp2,fp4,fp3 /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_pio4): + lis r4,NINEPI_4@h + ori r4,r4,NINEPI_4@l + cmpw r3,r4 + bge L(greater_or_equal_9pio4) + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + fabs fp1,fp1 /* |x| */ + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + fctiduz fp2,fp2 + mfvsrd r3,v2 /* n = |x| mod PI/4 */ + + /* Now use that quotient to find |x| mod (PI/2). */ + addi r7,r3,1 + rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */ + addi r6,r9,(L(pio2_table)-L(anchor)) + lfdx fp4,r5,r6 + fsub fp1,fp1,fp4 + + .balign 16 +L(reduced): + /* Now we are in the range -PI/4 to PI/4. */ + + /* Work out if we are in a positive or negative primary interval. */ + addi r7,r7,2 + rldicl r4,r7,62,63 /* ((n+3) >> 2) & 1 */ + + /* Load a 1.0 or -1.0. */ + addi r5,r9,(L(ones)-L(anchor)) + sldi r4,r4,3 + lfdx fp0,r4,r5 + + /* Are we in the primary interval of sin or cos? */ + andi. r4,r7,0x2 + bne L(cos) + + /* Chebyshev polynomial of the form: + x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + + lfd fp9,(L(S0)-L(anchor))(r9) + lfd fp10,(L(S1)-L(anchor))(r9) + lfd fp11,(L(S2)-L(anchor))(r9) + lfd fp12,(L(S3)-L(anchor))(r9) + lfd fp13,(L(S4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ + fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ + fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ + fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ + fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(cos): + /* Chebyshev polynomial of the form: + 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ + + lfd fp9,(L(C0)-L(anchor))(r9) + lfd fp10,(L(C1)-L(anchor))(r9) + lfd fp11,(L(C2)-L(anchor))(r9) + lfd fp12,(L(C3)-L(anchor))(r9) + lfd fp13,(L(C4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + lfd fp3,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ + fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ + fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ + fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ + fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_9pio4): + lis r4,INFINITY@h + ori r4,r4,INFINITY@l + cmpw r3,r4 + bge L(inf_or_nan) + + lis r4,TWO_P23@h + ori r4,r4,TWO_P23@l + cmpw r3,r4 + bge L(greater_or_equal_2p23) + + fabs fp1,fp1 /* |x| */ + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + + lfd fp3,(L(DPone)-L(anchor))(r9) + lfd fp4,(L(DPhalf)-L(anchor))(r9) + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + friz fp2,fp2 /* n = floor(|x|/(PI/4)) */ + + /* Calculate (n + 1) / 2. */ + fadd fp2,fp2,fp3 /* n + 1 */ + fmul fp3,fp2,fp4 /* (n + 1) / 2 */ + friz fp3,fp3 + + lfd fp4,(L(pio2hi)-L(anchor))(r9) + lfd fp5,(L(pio2lo)-L(anchor))(r9) + + fmul fp6,fp4,fp3 + fadd fp6,fp6,fp1 + fmadd fp1,fp5,fp3,fp6 + + fctiduz fp2,fp2 + mfvsrd r7,v2 /* n + 1 */ + + b L(reduced) + + .balign 16 +L(inf_or_nan): + bne L(skip_errno_setting) /* Is a NAN? */ + + /* We delayed the creation of the stack frame, as well as the saving of + the link register, because only at this point, we are sure that + doing so is actually needed. */ + + stfd fp1,-8(r1) + + /* Save the link register. */ + mflr r0 + std r0,16(r1) + cfi_offset(lr, 16) + + /* Create the stack frame. */ + stdu r1,-FRAMESIZE(r1) + cfi_adjust_cfa_offset(FRAMESIZE) + + bl JUMPTARGET(__errno_location) + nop + + /* Restore the stack frame. */ + addi r1,r1,FRAMESIZE + cfi_adjust_cfa_offset(-FRAMESIZE) + /* Restore the link register. */ + ld r0,16(r1) + mtlr r0 + + lfd fp1,-8(r1) + + /* errno = EDOM */ + li r4,EDOM + stw r4,0(r3) + +L(skip_errno_setting): + fsub fp1,fp1,fp1 /* x - x */ + blr + + .balign 16 +L(greater_or_equal_2p23): + fabs fp1,fp1 + + srwi r4,r3,FLOAT_EXPONENT_SHIFT + subi r4,r4,FLOAT_EXPONENT_BIAS + + /* We reduce the input modulo pi/4, so we need 3 bits of integer + to determine where in 2*pi we are. Index into our array + accordingly. */ + addi r4,r4,INTEGER_BITS + + /* To avoid an expensive divide, for the range we care about (0 - 127) + we can transform x/28 into: + + x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 + + mulhwu returns the top 32 bits of the 64 bit result, doing the + shift for us in the same instruction. The top 32 bits are undefined, + so we have to mask them. */ + + lis r6,FX_FRACTION_1_28@h + ori r6,r6,FX_FRACTION_1_28@l + mulhwu r5,r4,r6 + clrldi r5,r5,32 + + /* Get our pointer into the invpio4_table array. */ + sldi r4,r5,3 + addi r6,r9,(L(invpio4_table)-L(anchor)) + add r4,r4,r6 + + lfd fp2,0(r4) + lfd fp3,8(r4) + lfd fp4,16(r4) + lfd fp5,24(r4) + + fmul fp6,fp2,fp1 + fmul fp7,fp3,fp1 + fmul fp8,fp4,fp1 + fmul fp9,fp5,fp1 + + /* Mask off larger integer bits in highest double word that we don't + care about to avoid losing precision when combining with smaller + values. */ + fctiduz fp10,fp6 + mfvsrd r7,v10 + rldicr r7,r7,0,(63-INTEGER_BITS) + mtvsrd v10,r7 + fcfidu fp10,fp10 /* Integer bits. */ + + fsub fp6,fp6,fp10 /* highest -= integer bits */ + + /* Work out the integer component, rounded down. Use the top two + limbs for this. */ + fadd fp10,fp6,fp7 /* highest + higher */ + + fctiduz fp10,fp10 + mfvsrd r7,v10 + andi. r0,r7,1 + fcfidu fp10,fp10 + + /* Subtract integer component from highest limb. */ + fsub fp12,fp6,fp10 + + beq L(even_integer) + + /* Our integer component is odd, so we are in the -PI/4 to 0 primary + region. We need to shift our result down by PI/4, and to do this + in the mod (4/PI) space we simply subtract 1. */ + lfd fp11,(L(DPone)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,fp12,fp8 + fadd fp12,fp12,fp9 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(even_integer): + lfd fp11,(L(DPone)-L(anchor))(r9) + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,r12,fp8 + fadd fp12,r12,fp9 + + /* We need to check if the addition of all the limbs resulted in us + overflowing 1.0. */ + fcmpu 0,fp12,fp11 + bgt L(greater_than_one) + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(greater_than_one): + /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our + integer, and subtract 1.0 from our result. Since that makes the + integer component odd, we need to subtract another 1.0 as + explained above. */ + addi r7,r7,1 + + lfd fp11,(L(DPtwo)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + + .balign 16 +L(less_2pn5): + lis r4,TWO_PN27@h + ori r4,r4,TWO_PN27@l + + cmpw r3,r4 + blt L(less_2pn27) + + /* A simpler Chebyshev approximation is close enough for this range: + 1.0+x^2*(CC0+x^3*CC1). */ + + lfd fp10,(L(CC0)-L(anchor))(r9) + lfd fp11,(L(CC1)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + lfd fp1,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp3,fp11,fp10 /* CC0+x^3*CC1 */ + fmadd fp1,fp2,fp4,fp1 /* 1.0+x^2*(CC0+x^3*CC1) */ + + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(less_2pn27): + /* Handle some special cases: + + cosf(subnormal) raises inexact + cosf(min_normalized) raises inexact + cosf(normalized) raises inexact. */ + + lfd fp2,(L(DPone)-L(anchor))(r9) + + fabs fp1,fp1 /* |x| */ + fsub fp1,fp2,fp1 /* 1.0-|x| */ + + frsp fp1,fp1 + + blr + +END (__cosf) + + .section .rodata, "a" + + .balign 8 + +L(anchor): + + /* Chebyshev constants for sin, range -PI/4 - PI/4. */ +L(S0): .8byte 0xbfc5555555551cd9 +L(S1): .8byte 0x3f81111110c2688b +L(S2): .8byte 0xbf2a019f8b4bd1f9 +L(S3): .8byte 0x3ec71d7264e6b5b4 +L(S4): .8byte 0xbe5a947e1674b58a + + /* Chebyshev constants for cos, range 2^-27 - 2^-5. */ +L(CC0): .8byte 0xbfdfffffff5cc6fd +L(CC1): .8byte 0x3fa55514b178dac5 + + /* Chebyshev constants for cos, range -PI/4 - PI/4. */ +L(C0): .8byte 0xbfdffffffffe98ae +L(C1): .8byte 0x3fa55555545c50c7 +L(C2): .8byte 0xbf56c16b348b6874 +L(C3): .8byte 0x3efa00eb9ac43cc0 +L(C4): .8byte 0xbe923c97dd8844d7 + +L(invpio2): + .8byte 0x3fe45f306dc9c883 /* 2/PI */ + +L(invpio4): + .8byte 0x3ff45f306dc9c883 /* 4/PI */ + +L(invpio4_table): + .8byte 0x0000000000000000 + .8byte 0x3ff45f306c000000 + .8byte 0x3e3c9c882a000000 + .8byte 0x3c54fe13a8000000 + .8byte 0x3aaf47d4d0000000 + .8byte 0x38fbb81b6c000000 + .8byte 0x3714acc9e0000000 + .8byte 0x3560e4107c000000 + .8byte 0x33bca2c756000000 + .8byte 0x31fbd778ac000000 + .8byte 0x300b7246e0000000 + .8byte 0x2e5d2126e8000000 + .8byte 0x2c97003248000000 + .8byte 0x2ad77504e8000000 + .8byte 0x290921cfe0000000 + .8byte 0x274deb1cb0000000 + .8byte 0x25829a73e0000000 + .8byte 0x23fd1046be000000 + .8byte 0x2224baed10000000 + .8byte 0x20709d338e000000 + .8byte 0x1e535a2f80000000 + .8byte 0x1cef904e64000000 + .8byte 0x1b0d639830000000 + .8byte 0x1964ce7d24000000 + .8byte 0x17b908bf16000000 + +L(pio4): + .8byte 0x3fe921fb54442d18 /* PI/4 */ + +/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb + to avoid losing significant bits when multiplying with up to + (2^22)/(pi/2). */ +L(pio2hi): + .8byte 0xbff921fb54400000 + +L(pio2lo): + .8byte 0xbdd0b4611a626332 + +L(pio2_table): + .8byte 0 + .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */ + .8byte 0x400921fb54442d18 /* 2 * PI/2 */ + .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */ + .8byte 0x401921fb54442d18 /* 4 * PI/2 */ + .8byte 0x401f6a7a2955385e /* 5 * PI/2 */ + .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */ + .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */ + .8byte 0x402921fb54442d18 /* 8 * PI/2 */ + .8byte 0x402c463abeccb2bb /* 9 * PI/2 */ + .8byte 0x402f6a7a2955385e /* 10 * PI/2 */ + +L(small): + .8byte 0x3cd0000000000000 /* 2^-50 */ + +L(ones): + .8byte 0x3ff0000000000000 /* +1.0 */ + .8byte 0xbff0000000000000 /* -1.0 */ + +L(DPhalf): + .8byte 0x3fe0000000000000 /* 0.5 */ + +L(DPone): + .8byte 0x3ff0000000000000 /* 1.0 */ + +L(DPtwo): + .8byte 0x4000000000000000 /* 2.0 */ + +weak_alias(__cosf, cosf) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S new file mode 100644 index 0000000000..fcdcb60293 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S @@ -0,0 +1,56 @@ +/* isfinite(). PowerPC64/POWER8 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ + +/* int [r3] __finite ([fp1] x) */ + +EALIGN (__finite, 4, 0) + CALL_MCOUNT 0 + MFVSRD_R3_V1 + lis r9,0x8010 + clrldi r3,r3,1 /* r3 = r3 & 0x8000000000000000 */ + rldicr r9,r9,32,31 /* r9 = (r9 << 32) & 0xffffffff */ + add r3,r3,r9 + rldicl r3,r3,1,63 + blr +END (__finite) + +hidden_def (__finite) +weak_alias (__finite, finite) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__finite, __finitef) +hidden_def (__finitef) +weak_alias (__finitef, finitef) + +#if IS_IN (libm) +# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0) +compat_symbol (libm, __finite, __finitel, GLIBC_2_0) +compat_symbol (libm, finite, finitel, GLIBC_2_0) +# endif +#else +# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0) +compat_symbol (libc, __finite, __finitel, GLIBC_2_0); +compat_symbol (libc, finite, finitel, GLIBC_2_0); +# endif +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S new file mode 100644 index 0000000000..54bd94176d --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S @@ -0,0 +1 @@ +/* This function uses the same code as s_finite.S. */ diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S new file mode 100644 index 0000000000..32814e4525 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S @@ -0,0 +1,61 @@ +/* isinf(). PowerPC64/POWER8 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ + +/* int [r3] __isinf([fp1] x) */ + +EALIGN (__isinf, 4, 0) + CALL_MCOUNT 0 + MFVSRD_R3_V1 + lis r9,0x7ff0 /* r9 = 0x7ff0 */ + rldicl r10,r3,0,1 /* r10 = r3 & (0x8000000000000000) */ + sldi r9,r9,32 /* r9 = r9 << 52 */ + cmpd cr7,r10,r9 /* fp1 & 0x7ff0000000000000 ? */ + beq cr7,L(inf) + li r3,0 /* Not inf */ + blr +L(inf): + sradi r3,r3,63 /* r3 = r3 >> 63 */ + ori r3,r3,1 /* r3 = r3 | 0x1 */ + blr +END (__isinf) + +hidden_def (__isinf) +weak_alias (__isinf, isinf) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isinf, __isinff) +hidden_def (__isinff) +weak_alias (__isinff, isinff) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isinf, __isinfl) +weak_alias (__isinf, isinfl) +#endif + +#if !IS_IN (libm) +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0); +compat_symbol (libc, isinf, isinfl, GLIBC_2_0); +# endif +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S new file mode 100644 index 0000000000..be759e091e --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isinf.S. */ diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S new file mode 100644 index 0000000000..af52e502b7 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S @@ -0,0 +1,56 @@ +/* isnan(). PowerPC64/POWER8 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ + +/* int [r3] __isnan([f1] x) */ + +EALIGN (__isnan, 4, 0) + CALL_MCOUNT 0 + MFVSRD_R3_V1 + lis r9,0x7ff0 + clrldi r3,r3,1 /* r3 = r3 & 0x8000000000000000 */ + rldicr r9,r9,32,31 /* r9 = (r9 << 32) & 0xffffffff */ + subf r3,r3,r9 + rldicl r3,r3,1,63 + blr +END (__isnan) + +hidden_def (__isnan) +weak_alias (__isnan, isnan) + +/* It turns out that the 'double' version will also always work for + single-precision. */ +strong_alias (__isnan, __isnanf) +hidden_def (__isnanf) +weak_alias (__isnanf, isnanf) + +#ifdef NO_LONG_DOUBLE +strong_alias (__isnan, __isnanl) +weak_alias (__isnan, isnanl) +#endif + +#if !IS_IN (libm) +# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0) +compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0); +compat_symbol (libc, isnan, isnanl, GLIBC_2_0); +# endif +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S new file mode 100644 index 0000000000..b48c85e0d3 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S @@ -0,0 +1 @@ +/* This function uses the same code as s_isnan.S. */ diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S new file mode 100644 index 0000000000..aa180b6901 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S @@ -0,0 +1,45 @@ +/* Round double to long int. POWER8 PowerPC64 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <math_ldbl_opt.h> + +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ + +/* long long int[r3] __llrint (double x[fp1]) */ +ENTRY (__llrint) + CALL_MCOUNT 0 + fctid fp1,fp1 + MFVSRD_R3_V1 + blr +END (__llrint) + +strong_alias (__llrint, __lrint) +weak_alias (__llrint, llrint) +weak_alias (__lrint, lrint) + +#ifdef NO_LONG_DOUBLE +strong_alias (__llrint, __llrintl) +weak_alias (__llrint, llrintl) +strong_alias (__lrint, __lrintl) +weak_alias (__lrint, lrintl) +#endif +#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1) +compat_symbol (libm, __llrint, llrintl, GLIBC_2_1) +compat_symbol (libm, __lrint, lrintl, GLIBC_2_1) +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S new file mode 100644 index 0000000000..043fc6a089 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S @@ -0,0 +1,48 @@ +/* llround function. POWER8 PowerPC64 version. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <endian.h> +#include <math_ldbl_opt.h> + +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ + +/* long long [r3] llround (float x [fp1]) */ + +ENTRY (__llround) + CALL_MCOUNT 0 + frin fp1,fp1 /* Round to nearest +-0.5. */ + fctidz fp1,fp1 /* Convert To Integer DW round toward 0. */ + MFVSRD_R3_V1 + blr +END (__llround) + +strong_alias (__llround, __lround) +weak_alias (__llround, llround) +weak_alias (__lround, lround) + +#ifdef NO_LONG_DOUBLE +weak_alias (__llround, llroundl) +strong_alias (__llround, __llroundl) +weak_alias (__lround, lroundl) +strong_alias (__lround, __lroundl) +#endif +#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1) +compat_symbol (libm, __llround, llroundl, GLIBC_2_1) +compat_symbol (libm, __lround, lroundl, GLIBC_2_1) +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S new file mode 100644 index 0000000000..fb0add3462 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S @@ -0,0 +1,519 @@ +/* Optimized sinf(). PowerPC64/POWER8 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define _ERRNO_H 1 +#include <bits/errno.h> + +#define FRAMESIZE (FRAME_MIN_SIZE+16) + +#define FLOAT_EXPONENT_SHIFT 23 +#define FLOAT_EXPONENT_BIAS 127 +#define INTEGER_BITS 3 + +#define PI_4 0x3f490fdb /* PI/4 */ +#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */ +#define TWO_PN5 0x3d000000 /* 2^-5 */ +#define TWO_PN27 0x32000000 /* 2^-27 */ +#define INFINITY 0x7f800000 +#define TWO_P23 0x4b000000 /* 2^27 */ +#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */ + + /* Implements the function + + float [fp1] sinf (float [fp1] x) */ + + .machine power8 +EALIGN(__sinf, 4, 0) + addis r9,r2,L(anchor)@toc@ha + addi r9,r9,L(anchor)@toc@l + + lis r4,PI_4@h + ori r4,r4,PI_4@l + + xscvdpspn v0,v1 + mfvsrd r8,v0 + rldicl r3,r8,32,33 /* Remove sign bit. */ + + cmpw r3,r4 + bge L(greater_or_equal_pio4) + + lis r4,TWO_PN5@h + ori r4,r4,TWO_PN5@l + + cmpw r3,r4 + blt L(less_2pn5) + + /* Chebyshev polynomial of the form: + * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + + lfd fp9,(L(S0)-L(anchor))(r9) + lfd fp10,(L(S1)-L(anchor))(r9) + lfd fp11,(L(S2)-L(anchor))(r9) + lfd fp12,(L(S3)-L(anchor))(r9) + lfd fp13,(L(S4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ + fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ + fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ + fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ + fmadd fp1,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_pio4): + lis r4,NINEPI_4@h + ori r4,r4,NINEPI_4@l + cmpw r3,r4 + bge L(greater_or_equal_9pio4) + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + fabs fp1,fp1 /* |x| */ + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + fctiduz fp2,fp2 + mfvsrd r3,v2 /* n = |x| mod PI/4 */ + + /* Now use that quotient to find |x| mod (PI/2). */ + addi r7,r3,1 + rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */ + addi r6,r9,(L(pio2_table)-L(anchor)) + lfdx fp4,r5,r6 + fsub fp1,fp1,fp4 + + .balign 16 +L(reduced): + /* Now we are in the range -PI/4 to PI/4. */ + + /* Work out if we are in a positive or negative primary interval. */ + rldicl r4,r7,62,63 /* ((n+1) >> 2) & 1 */ + + /* We are operating on |x|, so we need to add back the original + sign. */ + rldicl r8,r8,33,63 /* (x >> 31) & 1, ie the sign bit. */ + xor r4,r4,r8 /* 0 if result should be positive, + 1 if negative. */ + + /* Load a 1.0 or -1.0. */ + addi r5,r9,(L(ones)-L(anchor)) + sldi r4,r4,3 + lfdx fp0,r4,r5 + + /* Are we in the primary interval of sin or cos? */ + andi. r4,r7,0x2 + bne L(cos) + + /* Chebyshev polynomial of the form: + x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + + lfd fp9,(L(S0)-L(anchor))(r9) + lfd fp10,(L(S1)-L(anchor))(r9) + lfd fp11,(L(S2)-L(anchor))(r9) + lfd fp12,(L(S3)-L(anchor))(r9) + lfd fp13,(L(S4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ + fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ + fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ + fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ + fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(cos): + /* Chebyshev polynomial of the form: + 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ + + lfd fp9,(L(C0)-L(anchor))(r9) + lfd fp10,(L(C1)-L(anchor))(r9) + lfd fp11,(L(C2)-L(anchor))(r9) + lfd fp12,(L(C3)-L(anchor))(r9) + lfd fp13,(L(C4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + lfd fp3,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ + fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ + fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ + fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ + fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_9pio4): + lis r4,INFINITY@h + ori r4,r4,INFINITY@l + cmpw r3,r4 + bge L(inf_or_nan) + + lis r4,TWO_P23@h + ori r4,r4,TWO_P23@l + cmpw r3,r4 + bge L(greater_or_equal_2p23) + + fabs fp1,fp1 /* |x| */ + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + + lfd fp3,(L(DPone)-L(anchor))(r9) + lfd fp4,(L(DPhalf)-L(anchor))(r9) + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + friz fp2,fp2 /* n = floor(|x|/(PI/4)) */ + + /* Calculate (n + 1) / 2. */ + fadd fp2,fp2,fp3 /* n + 1 */ + fmul fp3,fp2,fp4 /* (n + 1) / 2 */ + friz fp3,fp3 + + lfd fp4,(L(pio2hi)-L(anchor))(r9) + lfd fp5,(L(pio2lo)-L(anchor))(r9) + + fmul fp6,fp4,fp3 + fadd fp6,fp6,fp1 + fmadd fp1,fp5,fp3,fp6 + + fctiduz fp2,fp2 + mfvsrd r7,v2 /* n + 1 */ + + b L(reduced) + + .balign 16 +L(inf_or_nan): + bne L(skip_errno_setting) /* Is a NAN? */ + + /* We delayed the creation of the stack frame, as well as the saving of + the link register, because only at this point, we are sure that + doing so is actually needed. */ + + stfd fp1,-8(r1) + + /* Save the link register. */ + mflr r0 + std r0,16(r1) + cfi_offset(lr, 16) + + /* Create the stack frame. */ + stdu r1,-FRAMESIZE(r1) + cfi_adjust_cfa_offset(FRAMESIZE) + + bl JUMPTARGET(__errno_location) + nop + + /* Restore the stack frame. */ + addi r1,r1,FRAMESIZE + cfi_adjust_cfa_offset(-FRAMESIZE) + /* Restore the link register. */ + ld r0,16(r1) + mtlr r0 + + lfd fp1,-8(r1) + + /* errno = EDOM */ + li r4,EDOM + stw r4,0(r3) + +L(skip_errno_setting): + fsub fp1,fp1,fp1 /* x - x */ + blr + + .balign 16 +L(greater_or_equal_2p23): + fabs fp1,fp1 + + srwi r4,r3,FLOAT_EXPONENT_SHIFT + subi r4,r4,FLOAT_EXPONENT_BIAS + + /* We reduce the input modulo pi/4, so we need 3 bits of integer + to determine where in 2*pi we are. Index into our array + accordingly. */ + addi r4,r4,INTEGER_BITS + + /* To avoid an expensive divide, for the range we care about (0 - 127) + we can transform x/28 into: + + x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 + + mulhwu returns the top 32 bits of the 64 bit result, doing the + shift for us in the same instruction. The top 32 bits are undefined, + so we have to mask them. */ + + lis r6,FX_FRACTION_1_28@h + ori r6,r6,FX_FRACTION_1_28@l + mulhwu r5,r4,r6 + clrldi r5,r5,32 + + /* Get our pointer into the invpio4_table array. */ + sldi r4,r5,3 + addi r6,r9,(L(invpio4_table)-L(anchor)) + add r4,r4,r6 + + lfd fp2,0(r4) + lfd fp3,8(r4) + lfd fp4,16(r4) + lfd fp5,24(r4) + + fmul fp6,fp2,fp1 + fmul fp7,fp3,fp1 + fmul fp8,fp4,fp1 + fmul fp9,fp5,fp1 + + /* Mask off larger integer bits in highest double word that we don't + care about to avoid losing precision when combining with smaller + values. */ + fctiduz fp10,fp6 + mfvsrd r7,v10 + rldicr r7,r7,0,(63-INTEGER_BITS) + mtvsrd v10,r7 + fcfidu fp10,fp10 /* Integer bits. */ + + fsub fp6,fp6,fp10 /* highest -= integer bits */ + + /* Work out the integer component, rounded down. Use the top two + limbs for this. */ + fadd fp10,fp6,fp7 /* highest + higher */ + + fctiduz fp10,fp10 + mfvsrd r7,v10 + andi. r0,r7,1 + fcfidu fp10,fp10 + + /* Subtract integer component from highest limb. */ + fsub fp12,fp6,fp10 + + beq L(even_integer) + + /* Our integer component is odd, so we are in the -PI/4 to 0 primary + region. We need to shift our result down by PI/4, and to do this + in the mod (4/PI) space we simply subtract 1. */ + lfd fp11,(L(DPone)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,fp12,fp8 + fadd fp12,fp12,fp9 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(even_integer): + lfd fp11,(L(DPone)-L(anchor))(r9) + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,r12,fp8 + fadd fp12,r12,fp9 + + /* We need to check if the addition of all the limbs resulted in us + overflowing 1.0. */ + fcmpu 0,fp12,fp11 + bgt L(greater_than_one) + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(greater_than_one): + /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our + integer, and subtract 1.0 from our result. Since that makes the + integer component odd, we need to subtract another 1.0 as + explained above. */ + addi r7,r7,1 + + lfd fp11,(L(DPtwo)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + + .balign 16 +L(less_2pn5): + lis r4,TWO_PN27@h + ori r4,r4,TWO_PN27@l + + cmpw r3,r4 + blt L(less_2pn27) + + /* A simpler Chebyshev approximation is close enough for this range: + x+x^3*(SS0+x^2*SS1). */ + + lfd fp10,(L(SS0)-L(anchor))(r9) + lfd fp11,(L(SS1)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp11,fp10 /* SS0+x^2*SS1 */ + fmadd fp1,fp3,fp4,fp1 /* x+x^3*(SS0+x^2*SS1) */ + + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(less_2pn27): + cmpwi r3,0 + beq L(zero) + + /* Handle some special cases: + + sinf(subnormal) raises inexact/underflow + sinf(min_normalized) raises inexact/underflow + sinf(normalized) raises inexact. */ + + lfd fp2,(L(small)-L(anchor))(r9) + + fmul fp2,fp1,fp2 /* x * small */ + fsub fp1,fp1,fp2 /* x - x * small */ + + frsp fp1,fp1 + + blr + + .balign 16 +L(zero): + blr + +END (__sinf) + + .section .rodata, "a" + + .balign 8 + +L(anchor): + + /* Chebyshev constants for sin, range -PI/4 - PI/4. */ +L(S0): .8byte 0xbfc5555555551cd9 +L(S1): .8byte 0x3f81111110c2688b +L(S2): .8byte 0xbf2a019f8b4bd1f9 +L(S3): .8byte 0x3ec71d7264e6b5b4 +L(S4): .8byte 0xbe5a947e1674b58a + + /* Chebyshev constants for sin, range 2^-27 - 2^-5. */ +L(SS0): .8byte 0xbfc555555543d49d +L(SS1): .8byte 0x3f8110f475cec8c5 + + /* Chebyshev constants for cos, range -PI/4 - PI/4. */ +L(C0): .8byte 0xbfdffffffffe98ae +L(C1): .8byte 0x3fa55555545c50c7 +L(C2): .8byte 0xbf56c16b348b6874 +L(C3): .8byte 0x3efa00eb9ac43cc0 +L(C4): .8byte 0xbe923c97dd8844d7 + +L(invpio2): + .8byte 0x3fe45f306dc9c883 /* 2/PI */ + +L(invpio4): + .8byte 0x3ff45f306dc9c883 /* 4/PI */ + +L(invpio4_table): + .8byte 0x0000000000000000 + .8byte 0x3ff45f306c000000 + .8byte 0x3e3c9c882a000000 + .8byte 0x3c54fe13a8000000 + .8byte 0x3aaf47d4d0000000 + .8byte 0x38fbb81b6c000000 + .8byte 0x3714acc9e0000000 + .8byte 0x3560e4107c000000 + .8byte 0x33bca2c756000000 + .8byte 0x31fbd778ac000000 + .8byte 0x300b7246e0000000 + .8byte 0x2e5d2126e8000000 + .8byte 0x2c97003248000000 + .8byte 0x2ad77504e8000000 + .8byte 0x290921cfe0000000 + .8byte 0x274deb1cb0000000 + .8byte 0x25829a73e0000000 + .8byte 0x23fd1046be000000 + .8byte 0x2224baed10000000 + .8byte 0x20709d338e000000 + .8byte 0x1e535a2f80000000 + .8byte 0x1cef904e64000000 + .8byte 0x1b0d639830000000 + .8byte 0x1964ce7d24000000 + .8byte 0x17b908bf16000000 + +L(pio4): + .8byte 0x3fe921fb54442d18 /* PI/4 */ + +/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb + to avoid losing significant bits when multiplying with up to + (2^22)/(pi/2). */ +L(pio2hi): + .8byte 0xbff921fb54400000 + +L(pio2lo): + .8byte 0xbdd0b4611a626332 + +L(pio2_table): + .8byte 0 + .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */ + .8byte 0x400921fb54442d18 /* 2 * PI/2 */ + .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */ + .8byte 0x401921fb54442d18 /* 4 * PI/2 */ + .8byte 0x401f6a7a2955385e /* 5 * PI/2 */ + .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */ + .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */ + .8byte 0x402921fb54442d18 /* 8 * PI/2 */ + .8byte 0x402c463abeccb2bb /* 9 * PI/2 */ + .8byte 0x402f6a7a2955385e /* 10 * PI/2 */ + +L(small): + .8byte 0x3cd0000000000000 /* 2^-50 */ + +L(ones): + .8byte 0x3ff0000000000000 /* +1.0 */ + .8byte 0xbff0000000000000 /* -1.0 */ + +L(DPhalf): + .8byte 0x3fe0000000000000 /* 0.5 */ + +L(DPone): + .8byte 0x3ff0000000000000 /* 1.0 */ + +L(DPtwo): + .8byte 0x4000000000000000 /* 2.0 */ + +weak_alias(__sinf, sinf) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S new file mode 100644 index 0000000000..46b9c0067a --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memcmp.S @@ -0,0 +1,1447 @@ +/* Optimized memcmp implementation for POWER7/PowerPC64. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* int [r3] memcmp (const char *s1 [r3], + const char *s2 [r4], + size_t size [r5]) */ + +/* TODO: change these to the actual instructions when the minimum required + binutils allows it. */ +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + .machine power7 +EALIGN (MEMCMP, 4, 0) + CALL_MCOUNT 3 + +#define rRTN r3 +#define rSTR1 r3 /* First string arg. */ +#define rSTR2 r4 /* Second string arg. */ +#define rN r5 /* Max string length. */ +#define rWORD1 r6 /* Current word in s1. */ +#define rWORD2 r7 /* Current word in s2. */ +#define rWORD3 r8 /* Next word in s1. */ +#define rWORD4 r9 /* Next word in s2. */ +#define rWORD5 r10 /* Next word in s1. */ +#define rWORD6 r11 /* Next word in s2. */ + +#define rOFF8 r20 /* 8 bytes offset. */ +#define rOFF16 r21 /* 16 bytes offset. */ +#define rOFF24 r22 /* 24 bytes offset. */ +#define rOFF32 r23 /* 24 bytes offset. */ +#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ +#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ +#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ +#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ +#define rSHR r28 /* Unaligned shift right count. */ +#define rSHL r29 /* Unaligned shift left count. */ +#define rWORD7 r30 /* Next word in s1. */ +#define rWORD8 r31 /* Next word in s2. */ + +#define rWORD8SAVE (-8) +#define rWORD7SAVE (-16) +#define rOFF8SAVE (-24) +#define rOFF16SAVE (-32) +#define rOFF24SAVE (-40) +#define rOFF32SAVE (-48) +#define rSHRSAVE (-56) +#define rSHLSAVE (-64) +#define rWORD8SHIFTSAVE (-72) +#define rWORD2SHIFTSAVE (-80) +#define rWORD4SHIFTSAVE (-88) +#define rWORD6SHIFTSAVE (-96) + +#ifdef __LITTLE_ENDIAN__ +# define LD ldbrx +#else +# define LD ldx +#endif + + xor r10, rSTR2, rSTR1 + cmpldi cr6, rN, 0 + cmpldi cr1, rN, 8 + clrldi. r0, r10, 61 + clrldi r12, rSTR1, 61 + cmpldi cr5, r12, 0 + beq- cr6, L(zeroLength) + dcbt 0, rSTR1 + dcbt 0, rSTR2 + /* If less than 8 bytes or not aligned, use the unaligned + byte loop. */ + blt cr1, L(bytealigned) + bne L(unalignedqw) +/* At this point we know both strings have the same alignment and the + compare length is at least 8 bytes. r12 contains the low order + 3 bits of rSTR1 and cr5 contains the result of the logical compare + of r12 to 0. If r12 == 0 then we are already double word + aligned and can perform the DW aligned loop. */ + + .align 4 +L(samealignment): + or r11, rSTR2, rSTR1 + clrldi. r11, r11, 60 + beq L(qw_align) + /* Try to align to QW else proceed to DW loop. */ + clrldi. r10, r10, 60 + bne L(DW) + /* For the difference to reach QW alignment, load as DW. */ + clrrdi rSTR1, rSTR1, 3 + clrrdi rSTR2, rSTR2, 3 + subfic r10, r12, 8 + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 + sldi r9, r10, 3 + subfic r9, r9, 64 + sld rWORD1, rWORD1, r9 + sld rWORD2, rWORD2, r9 + cmpld cr6, rWORD1, rWORD2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + bne cr6, L(ret_diff) + subf rN, r10, rN + + cmpld cr6, r11, r12 + bgt cr6, L(qw_align) + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 + cmpld cr6, rWORD1, rWORD2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + bne cr6, L(different) + cmpldi cr6, rN, 8 + ble cr6, L(zeroLength) + addi rN, rN, -8 + /* Now both rSTR1 and rSTR2 are aligned to QW. */ + .align 4 +L(qw_align): + vspltisb v0, 0 + srdi. r6, rN, 6 + li r8, 16 + li r10, 32 + li r11, 48 + ble cr0, L(lessthan64) + mtctr r6 + vspltisb v8, 0 + vspltisb v6, 0 + /* Aligned vector loop. */ + .align 4 +L(aligned_loop): + lvx v4, 0, rSTR1 + lvx v5, 0, rSTR2 + vcmpequb. v7, v6, v8 + bnl cr6, L(different3) + lvx v6, rSTR1, r8 + lvx v8, rSTR2, r8 + vcmpequb. v7, v5, v4 + bnl cr6, L(different2) + lvx v4, rSTR1, r10 + lvx v5, rSTR2, r10 + vcmpequb. v7, v6, v8 + bnl cr6, L(different3) + lvx v6, rSTR1, r11 + lvx v8, rSTR2, r11 + vcmpequb. v7, v5, v4 + bnl cr6, L(different2) + addi rSTR1, rSTR1, 64 + addi rSTR2, rSTR2, 64 + bdnz L(aligned_loop) + vcmpequb. v7, v6, v8 + bnl cr6, L(different3) + clrldi rN, rN, 58 + /* Handle remainder for aligned loop. */ + .align 4 +L(lessthan64): + mr r9, rSTR1 + cmpdi cr6, rN, 0 + li rSTR1, 0 + blelr cr6 + lvx v4, 0, r9 + lvx v5, 0, rSTR2 + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + addi rN, rN, -16 + + cmpdi cr6, rN, 0 + blelr cr6 + lvx v4, r9, r8 + lvx v5, rSTR2, r8 + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + addi rN, rN, -16 + + cmpdi cr6, rN, 0 + blelr cr6 + lvx v4, r9, r10 + lvx v5, rSTR2, r10 + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + addi rN, rN, -16 + + cmpdi cr6, rN, 0 + blelr cr6 + lvx v4, r9, r11 + lvx v5, rSTR2, r11 + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + blr + + /* Calculate and return the difference. */ + .align 4 +L(different1): + cmpdi cr6, rN, 16 + bge cr6, L(different2) + /* Discard unwanted bytes. */ +#ifdef __LITTLE_ENDIAN__ + lvsr v1, 0, rN + vperm v4, v4, v0, v1 + vperm v5, v5, v0, v1 +#else + lvsl v1, 0, rN + vperm v4, v0, v4, v1 + vperm v5, v0, v5, v1 +#endif + vcmpequb. v7, v4, v5 + li rRTN, 0 + bltlr cr6 + .align 4 +L(different2): +#ifdef __LITTLE_ENDIAN__ + /* Reverse bytes for direct comparison. */ + lvsl v10, r0, r0 + vspltisb v8, 15 + vsububm v9, v8, v10 + vperm v4, v4, v0, v9 + vperm v5, v5, v0, v9 +#endif + MFVRD(r7, v4) + MFVRD(r9, v5) + cmpld cr6, r7, r9 + bne cr6, L(ret_diff) + /* Difference in second DW. */ + vsldoi v4, v4, v4, 8 + vsldoi v5, v5, v5, 8 + MFVRD(r7, v4) + MFVRD(r9, v5) + cmpld cr6, r7, r9 +L(ret_diff): + li rRTN, 1 + bgtlr cr6 + li rRTN, -1 + blr + .align 4 +L(different3): +#ifdef __LITTLE_ENDIAN__ + /* Reverse bytes for direct comparison. */ + vspltisb v9, 15 + lvsl v10, r0, r0 + vsububm v9, v9, v10 + vperm v6, v6, v0, v9 + vperm v8, v8, v0, v9 +#endif + MFVRD(r7, v6) + MFVRD(r9, v8) + cmpld cr6, r7, r9 + bne cr6, L(ret_diff) + /* Difference in second DW. */ + vsldoi v6, v6, v6, 8 + vsldoi v8, v8, v8, 8 + MFVRD(r7, v6) + MFVRD(r9, v8) + cmpld cr6, r7, r9 + li rRTN, 1 + bgtlr cr6 + li rRTN, -1 + blr + + .align 4 +L(different): + cmpldi cr7, rN, 8 + bgt cr7, L(end) + /* Skip unwanted bytes. */ + sldi r8, rN, 3 + subfic r8, r8, 64 + srd rWORD1, rWORD1, r8 + srd rWORD2, rWORD2, r8 + cmpld cr6, rWORD1, rWORD2 + li rRTN, 0 + beqlr cr6 +L(end): + li rRTN, 1 + bgtlr cr6 + li rRTN, -1 + blr + + .align 4 +L(unalignedqw): + /* Proceed to DW unaligned loop,if there is a chance of pagecross. */ + rldicl r9, rSTR1, 0, 52 + add r9, r9, rN + cmpldi cr0, r9, 4096-16 + bgt cr0, L(unaligned) + rldicl r9, rSTR2, 0, 52 + add r9, r9, rN + cmpldi cr0, r9, 4096-16 + bgt cr0, L(unaligned) + li r0, 0 + li r8, 16 + vspltisb v0, 0 + /* Check if rSTR1 is aligned to QW. */ + andi. r11, rSTR1, 0xF + beq L(s1_align) + + /* Compare 16B and align S1 to QW. */ +#ifdef __LITTLE_ENDIAN__ + lvsr v10, 0, rSTR1 /* Compute mask. */ + lvsr v6, 0, rSTR2 /* Compute mask. */ +#else + lvsl v10, 0, rSTR1 /* Compute mask. */ + lvsl v6, 0, rSTR2 /* Compute mask. */ +#endif + lvx v5, 0, rSTR2 + lvx v9, rSTR2, r8 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v9, v5, v6 +#else + vperm v5, v5, v9, v6 +#endif + lvx v4, 0, rSTR1 + lvx v9, rSTR1, r8 +#ifdef __LITTLE_ENDIAN__ + vperm v4, v9, v4, v10 +#else + vperm v4, v4, v9, v10 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + cmpldi cr6, rN, 16 + ble cr6, L(zeroLength) + subfic r11, r11, 16 + subf rN, r11, rN + add rSTR1, rSTR1, r11 + add rSTR2, rSTR2, r11 + + /* As s1 is QW aligned prepare for unaligned loop. */ + .align 4 +L(s1_align): +#ifdef __LITTLE_ENDIAN__ + lvsr v6, 0, rSTR2 +#else + lvsl v6, 0, rSTR2 +#endif + lvx v5, 0, rSTR2 + srdi. r6, rN, 6 + li r10, 32 + li r11, 48 + ble cr0, L(lessthan64_unalign) + mtctr r6 + li r9, 64 + /* Unaligned vector loop. */ + .align 4 +L(unalign_qwloop): + lvx v4, 0, rSTR1 + lvx v10, rSTR2, r8 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v10, v5, v6 +#else + vperm v5, v5, v10, v6 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different2) + vor v5, v10, v10 + lvx v4, rSTR1, r8 + lvx v10, rSTR2, r10 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v10, v5, v6 +#else + vperm v5, v5, v10, v6 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different2) + vor v5, v10, v10 + lvx v4, rSTR1, r10 + lvx v10, rSTR2, r11 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v10, v5, v6 +#else + vperm v5, v5, v10, v6 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different2) + vor v5, v10, v10 + lvx v4, rSTR1, r11 + lvx v10, rSTR2, r9 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v10, v5, v6 +#else + vperm v5, v5, v10, v6 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different2) + vor v5, v10, v10 + addi rSTR1, rSTR1, 64 + addi rSTR2, rSTR2, 64 + bdnz L(unalign_qwloop) + clrldi rN, rN, 58 + /* Handle remainder for unaligned loop. */ + .align 4 +L(lessthan64_unalign): + mr r9, rSTR1 + cmpdi cr6, rN, 0 + li rSTR1, 0 + blelr cr6 + lvx v4, 0, r9 + lvx v10, rSTR2, r8 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v10, v5, v6 +#else + vperm v5, v5, v10, v6 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + vor v5, v10, v10 + addi rN, rN, -16 + + cmpdi cr6, rN, 0 + blelr cr6 + lvx v4, r9, r8 + lvx v10, rSTR2, r10 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v10, v5, v6 +#else + vperm v5, v5, v10, v6 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + vor v5, v10, v10 + addi rN, rN, -16 + + cmpdi cr6, rN, 0 + blelr cr6 + lvx v4, r9, r10 + lvx v10, rSTR2, r11 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v10, v5, v6 +#else + vperm v5, v5, v10, v6 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + vor v5, v10, v10 + addi rN, rN, -16 + + cmpdi cr6, rN, 0 + blelr cr6 + lvx v4, r9, r11 + addi r11, r11, 16 + lvx v10, rSTR2, r11 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v10, v5, v6 +#else + vperm v5, v5, v10, v6 +#endif + vcmpequb. v7, v5, v4 + bnl cr6, L(different1) + blr + +/* Otherwise we know the two strings have the same alignment (but not + yet DW). So we force the string addresses to the next lower DW + boundary and special case this first DW using shift left to + eliminate bits preceding the first byte. Since we want to join the + normal (DW aligned) compare loop, starting at the second double word, + we need to adjust the length (rN) and special case the loop + versioning for the first DW. This ensures that the loop count is + correct and the first DW (shifted) is in the expected register pair. */ + .align 4 +L(DW): + std rWORD8, rWORD8SAVE(r1) + std rWORD7, rWORD7SAVE(r1) + std rOFF8, rOFF8SAVE(r1) + std rOFF16, rOFF16SAVE(r1) + std rOFF24, rOFF24SAVE(r1) + std rOFF32, rOFF32SAVE(r1) + cfi_offset(rWORD8, rWORD8SAVE) + cfi_offset(rWORD7, rWORD7SAVE) + cfi_offset(rOFF8, rOFF8SAVE) + cfi_offset(rOFF16, rOFF16SAVE) + cfi_offset(rOFF24, rOFF24SAVE) + cfi_offset(rOFF32, rOFF32SAVE) + + li rOFF8,8 + li rOFF16,16 + li rOFF24,24 + li rOFF32,32 + clrrdi rSTR1, rSTR1, 3 + clrrdi rSTR2, rSTR2, 3 + beq cr5, L(DWaligned) + add rN, rN, r12 + sldi rWORD6, r12, 3 + srdi r0, rN, 5 /* Divide by 32. */ + andi. r12, rN, 24 /* Get the DW remainder. */ + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 + beq L(dPs4) + mtctr r0 + bgt cr1, L(dPs3) + beq cr1, L(dPs2) + +/* Remainder is 8. */ + .align 3 +L(dsP1): + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD2, rWORD6 + cmpld cr5, rWORD5, rWORD6 + blt cr7, L(dP1x) +/* Do something useful in this cycle since we have to branch anyway. */ + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr7, rWORD1, rWORD2 + b L(dP1e) +/* Remainder is 16. */ + .align 4 +L(dPs2): + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD2, rWORD6 + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP2x) +/* Do something useful in this cycle since we have to branch anyway. */ + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 + cmpld cr5, rWORD7, rWORD8 + b L(dP2e) +/* Remainder is 24. */ + .align 4 +L(dPs3): + sld rWORD3, rWORD1, rWORD6 + sld rWORD4, rWORD2, rWORD6 + cmpld cr1, rWORD3, rWORD4 + b L(dP3e) +/* Count is a multiple of 32, remainder is 0. */ + .align 4 +L(dPs4): + mtctr r0 + sld rWORD1, rWORD1, rWORD6 + sld rWORD2, rWORD2, rWORD6 + cmpld cr7, rWORD1, rWORD2 + b L(dP4e) + +/* At this point we know both strings are double word aligned and the + compare length is at least 8 bytes. */ + .align 4 +L(DWaligned): + andi. r12, rN, 24 /* Get the DW remainder. */ + srdi r0, rN, 5 /* Divide by 32. */ + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 + beq L(dP4) + bgt cr1, L(dP3) + beq cr1, L(dP2) + +/* Remainder is 8. */ + .align 4 +L(dP1): + mtctr r0 +/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early + (8-15 byte compare), we want to use only volatile registers. This + means we can avoid restoring non-volatile registers since we did not + change any on the early exit path. The key here is the non-early + exit path only cares about the condition code (cr5), not about which + register pair was used. */ + LD rWORD5, 0, rSTR1 + LD rWORD6, 0, rSTR2 + cmpld cr5, rWORD5, rWORD6 + blt cr7, L(dP1x) + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr7, rWORD1, rWORD2 +L(dP1e): + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 + cmpld cr1, rWORD3, rWORD4 + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5x) + bne cr7, L(dLcr7x) + + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 + bne cr1, L(dLcr1) + cmpld cr5, rWORD7, rWORD8 + bdnz L(dLoop) + bne cr6, L(dLcr6) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + .align 3 +L(dP1x): + sldi. r12, rN, 3 + bne cr5, L(dLcr5x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ + bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + +/* Remainder is 16. */ + .align 4 +L(dP2): + mtctr r0 + LD rWORD5, 0, rSTR1 + LD rWORD6, 0, rSTR2 + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP2x) + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 + cmpld cr5, rWORD7, rWORD8 +L(dP2e): + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 + cmpld cr7, rWORD1, rWORD2 + LD rWORD3, rOFF24, rSTR1 + LD rWORD4, rOFF24, rSTR2 + cmpld cr1, rWORD3, rWORD4 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + bne cr6, L(dLcr6) + bne cr5, L(dLcr5) + b L(dLoop2) + .align 4 +L(dP2x): + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + sldi. r12, rN, 3 + bne cr6, L(dLcr6x) + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + bne cr1, L(dLcr1x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ + bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + +/* Remainder is 24. */ + .align 4 +L(dP3): + mtctr r0 + LD rWORD3, 0, rSTR1 + LD rWORD4, 0, rSTR2 + cmpld cr1, rWORD3, rWORD4 +L(dP3e): + LD rWORD5, rOFF8, rSTR1 + LD rWORD6, rOFF8, rSTR2 + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP3x) + LD rWORD7, rOFF16, rSTR1 + LD rWORD8, rOFF16, rSTR2 + cmpld cr5, rWORD7, rWORD8 + LD rWORD1, rOFF24, rSTR1 + LD rWORD2, rOFF24, rSTR2 + cmpld cr7, rWORD1, rWORD2 + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 + bne cr1, L(dLcr1) + bne cr6, L(dLcr6) + b L(dLoop1) +/* Again we are on a early exit path (24-31 byte compare), we want to + only use volatile registers and avoid restoring non-volatile + registers. */ + .align 4 +L(dP3x): + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 + cmpld cr7, rWORD1, rWORD2 + sldi. r12, rN, 3 + bne cr1, L(dLcr1x) + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 + bne cr6, L(dLcr6x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ + bne cr7, L(dLcr7x) + bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + +/* Count is a multiple of 32, remainder is 0. */ + .align 4 +L(dP4): + mtctr r0 + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 + cmpld cr7, rWORD1, rWORD2 +L(dP4e): + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + LD rWORD5, rOFF16, rSTR1 + LD rWORD6, rOFF16, rSTR2 + cmpld cr6, rWORD5, rWORD6 + LD rWORD7, rOFF24, rSTR1 + LD rWORD8, rOFF24, rSTR2 + addi rSTR1, rSTR1, 24 + addi rSTR2, rSTR2, 24 + cmpld cr5, rWORD7, rWORD8 + bne cr7, L(dLcr7) + bne cr1, L(dLcr1) + bdz- L(d24) /* Adjust CTR as we start with +4. */ +/* This is the primary loop. */ + .align 4 +L(dLoop): + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(dLcr6) +L(dLoop1): + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5) +L(dLoop2): + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 + cmpld cr5, rWORD7, rWORD8 + bne cr7, L(dLcr7) +L(dLoop3): + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 + bne cr1, L(dLcr1) + cmpld cr7, rWORD1, rWORD2 + bdnz L(dLoop) + +L(dL4): + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(dLcr6) + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5) + cmpld cr5, rWORD7, rWORD8 +L(d44): + bne cr7, L(dLcr7) +L(d34): + bne cr1, L(dLcr1) +L(d24): + bne cr6, L(dLcr6) +L(d14): + sldi. r12, rN, 3 + bne cr5, L(dLcr5) +L(d04): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ + beq L(duzeroLength) +/* At this point we have a remainder of 1 to 7 bytes to compare. Since + we are aligned it is safe to load the whole double word, and use + shift right double to eliminate bits beyond the compare length. */ +L(d00): + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + srd rWORD1, rWORD1, rN + srd rWORD2, rWORD2, rN + cmpld cr7, rWORD1, rWORD2 + bne cr7, L(dLcr7x) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + + .align 4 +L(dLcr7): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dLcr7x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 1 + bgtlr cr7 + li rRTN, -1 + blr + .align 4 +L(dLcr1): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dLcr1x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 1 + bgtlr cr1 + li rRTN, -1 + blr + .align 4 +L(dLcr6): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dLcr6x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 1 + bgtlr cr6 + li rRTN, -1 + blr + .align 4 +L(dLcr5): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dLcr5x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 1 + bgtlr cr5 + li rRTN, -1 + blr + + .align 4 +L(bytealigned): + mtctr rN + +/* We need to prime this loop. This loop is swing modulo scheduled + to avoid pipe delays. The dependent instruction latencies (load to + compare to conditional branch) is 2 to 3 cycles. In this loop each + dispatch group ends in a branch and takes 1 cycle. Effectively + the first iteration of the loop only serves to load operands and + branches based on compares are delayed until the next loop. + + So we must precondition some registers and condition codes so that + we don't exit the loop early on the first iteration. */ + + lbz rWORD1, 0(rSTR1) + lbz rWORD2, 0(rSTR2) + bdz L(b11) + cmpld cr7, rWORD1, rWORD2 + lbz rWORD3, 1(rSTR1) + lbz rWORD4, 1(rSTR2) + bdz L(b12) + cmpld cr1, rWORD3, rWORD4 + lbzu rWORD5, 2(rSTR1) + lbzu rWORD6, 2(rSTR2) + bdz L(b13) + .align 4 +L(bLoop): + lbzu rWORD1, 1(rSTR1) + lbzu rWORD2, 1(rSTR2) + bne cr7, L(bLcr7) + + cmpld cr6, rWORD5, rWORD6 + bdz L(b3i) + + lbzu rWORD3, 1(rSTR1) + lbzu rWORD4, 1(rSTR2) + bne cr1, L(bLcr1) + + cmpld cr7, rWORD1, rWORD2 + bdz L(b2i) + + lbzu rWORD5, 1(rSTR1) + lbzu rWORD6, 1(rSTR2) + bne cr6, L(bLcr6) + + cmpld cr1, rWORD3, rWORD4 + bdnz L(bLoop) + +/* We speculatively loading bytes before we have tested the previous + bytes. But we must avoid overrunning the length (in the ctr) to + prevent these speculative loads from causing a segfault. In this + case the loop will exit early (before the all pending bytes are + tested. In this case we must complete the pending operations + before returning. */ +L(b1i): + bne cr7, L(bLcr7) + bne cr1, L(bLcr1) + b L(bx56) + .align 4 +L(b2i): + bne cr6, L(bLcr6) + bne cr7, L(bLcr7) + b L(bx34) + .align 4 +L(b3i): + bne cr1, L(bLcr1) + bne cr6, L(bLcr6) + b L(bx12) + .align 4 +L(bLcr7): + li rRTN, 1 + bgtlr cr7 + li rRTN, -1 + blr +L(bLcr1): + li rRTN, 1 + bgtlr cr1 + li rRTN, -1 + blr +L(bLcr6): + li rRTN, 1 + bgtlr cr6 + li rRTN, -1 + blr + +L(b13): + bne cr7, L(bx12) + bne cr1, L(bx34) +L(bx56): + sub rRTN, rWORD5, rWORD6 + blr + nop +L(b12): + bne cr7, L(bx12) +L(bx34): + sub rRTN, rWORD3, rWORD4 + blr +L(b11): +L(bx12): + sub rRTN, rWORD1, rWORD2 + blr + + .align 4 +L(zeroLength): + li rRTN, 0 + blr + + .align 4 +/* At this point we know the strings have different alignment and the + compare length is at least 8 bytes. r12 contains the low order + 3 bits of rSTR1 and cr5 contains the result of the logical compare + of r12 to 0. If r12 == 0 then rStr1 is double word + aligned and can perform the DWunaligned loop. + + Otherwise we know that rSTR1 is not already DW aligned yet. + So we can force the string addresses to the next lower DW + boundary and special case this first DW using shift left to + eliminate bits preceding the first byte. Since we want to join the + normal (DWaligned) compare loop, starting at the second double word, + we need to adjust the length (rN) and special case the loop + versioning for the first DW. This ensures that the loop count is + correct and the first DW (shifted) is in the expected resister pair. */ +L(unaligned): + std rWORD8, rWORD8SAVE(r1) + std rWORD7, rWORD7SAVE(r1) + std rOFF8, rOFF8SAVE(r1) + std rOFF16, rOFF16SAVE(r1) + std rOFF24, rOFF24SAVE(r1) + std rOFF32, rOFF32SAVE(r1) + cfi_offset(rWORD8, rWORD8SAVE) + cfi_offset(rWORD7, rWORD7SAVE) + cfi_offset(rOFF8, rOFF8SAVE) + cfi_offset(rOFF16, rOFF16SAVE) + cfi_offset(rOFF24, rOFF24SAVE) + cfi_offset(rOFF32, rOFF32SAVE) + li rOFF8,8 + li rOFF16,16 + li rOFF24,24 + li rOFF32,32 + std rSHL, rSHLSAVE(r1) + cfi_offset(rSHL, rSHLSAVE) + clrldi rSHL, rSTR2, 61 + beq cr6, L(duzeroLength) + std rSHR, rSHRSAVE(r1) + cfi_offset(rSHR, rSHRSAVE) + beq cr5, L(DWunaligned) + std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) +/* Adjust the logical start of rSTR2 to compensate for the extra bits + in the 1st rSTR1 DW. */ + sub rWORD8_SHIFT, rSTR2, r12 +/* But do not attempt to address the DW before that DW that contains + the actual start of rSTR2. */ + clrrdi rSTR2, rSTR2, 3 + std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) +/* Compute the left/right shift counts for the unaligned rSTR2, + compensating for the logical (DW aligned) start of rSTR1. */ + clrldi rSHL, rWORD8_SHIFT, 61 + clrrdi rSTR1, rSTR1, 3 + std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) + sldi rSHL, rSHL, 3 + cmpld cr5, rWORD8_SHIFT, rSTR2 + add rN, rN, r12 + sldi rWORD6, r12, 3 + std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) + cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) + cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) + subfic rSHR, rSHL, 64 + srdi r0, rN, 5 /* Divide by 32. */ + andi. r12, rN, 24 /* Get the DW remainder. */ +/* We normally need to load 2 DWs to start the unaligned rSTR2, but in + this special case those bits may be discarded anyway. Also we + must avoid loading a DW where none of the bits are part of rSTR2 as + this may cross a page boundary and cause a page fault. */ + li rWORD8, 0 + blt cr5, L(dus0) + LD rWORD8, 0, rSTR2 + addi rSTR2, rSTR2, 8 + sld rWORD8, rWORD8, rSHL + +L(dus0): + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + srd r12, rWORD2, rSHR + clrldi rN, rN, 61 + beq L(duPs4) + mtctr r0 + or rWORD8, r12, rWORD8 + bgt cr1, L(duPs3) + beq cr1, L(duPs2) + +/* Remainder is 8. */ + .align 4 +L(dusP1): + sld rWORD8_SHIFT, rWORD2, rSHL + sld rWORD7, rWORD1, rWORD6 + sld rWORD8, rWORD8, rWORD6 + bge cr7, L(duP1e) +/* At this point we exit early with the first double word compare + complete and remainder of 0 to 7 bytes. See L(du14) for details on + how we handle the remaining bytes. */ + cmpld cr5, rWORD7, rWORD8 + sldi. rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + b L(dutrim) +/* Remainder is 16. */ + .align 4 +L(duPs2): + sld rWORD6_SHIFT, rWORD2, rSHL + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD8, rWORD6 + b L(duP2e) +/* Remainder is 24. */ + .align 4 +L(duPs3): + sld rWORD4_SHIFT, rWORD2, rSHL + sld rWORD3, rWORD1, rWORD6 + sld rWORD4, rWORD8, rWORD6 + b L(duP3e) +/* Count is a multiple of 32, remainder is 0. */ + .align 4 +L(duPs4): + mtctr r0 + or rWORD8, r12, rWORD8 + sld rWORD2_SHIFT, rWORD2, rSHL + sld rWORD1, rWORD1, rWORD6 + sld rWORD2, rWORD8, rWORD6 + b L(duP4e) + +/* At this point we know rSTR1 is double word aligned and the + compare length is at least 8 bytes. */ + .align 4 +L(DWunaligned): + std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + clrrdi rSTR2, rSTR2, 3 + std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + srdi r0, rN, 5 /* Divide by 32. */ + std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) + andi. r12, rN, 24 /* Get the DW remainder. */ + std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) + cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) + cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) + cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) + sldi rSHL, rSHL, 3 + LD rWORD6, 0, rSTR2 + LD rWORD8, rOFF8, rSTR2 + addi rSTR2, rSTR2, 8 + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 + subfic rSHR, rSHL, 64 + sld rWORD6_SHIFT, rWORD6, rSHL + beq L(duP4) + mtctr r0 + bgt cr1, L(duP3) + beq cr1, L(duP2) + +/* Remainder is 8. */ + .align 4 +L(duP1): + srd r12, rWORD8, rSHR + LD rWORD7, 0, rSTR1 + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP1x) +L(duP1e): + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr5, rWORD7, rWORD8 + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 + cmpld cr7, rWORD1, rWORD2 + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + bne cr5, L(duLcr5) + or rWORD4, r12, rWORD2_SHIFT + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 + cmpld cr1, rWORD3, rWORD4 + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + bne cr7, L(duLcr7) + or rWORD6, r0, rWORD4_SHIFT + cmpld cr6, rWORD5, rWORD6 + b L(duLoop3) + .align 4 +/* At this point we exit early with the first double word compare + complete and remainder of 0 to 7 bytes. See L(du14) for details on + how we handle the remaining bytes. */ +L(duP1x): + cmpld cr5, rWORD7, rWORD8 + sldi. rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + b L(dutrim) +/* Remainder is 16. */ + .align 4 +L(duP2): + srd r0, rWORD8, rSHR + LD rWORD5, 0, rSTR1 + or rWORD6, r0, rWORD6_SHIFT + sld rWORD6_SHIFT, rWORD8, rSHL +L(duP2e): + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 + cmpld cr6, rWORD5, rWORD6 + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP2x) + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT + LD rWORD3, rOFF24, rSTR1 + LD rWORD4, rOFF24, rSTR2 + cmpld cr7, rWORD1, rWORD2 + bne cr5, L(duLcr5) + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + cmpld cr1, rWORD3, rWORD4 + b L(duLoop2) + .align 4 +L(duP2x): + cmpld cr5, rWORD7, rWORD8 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 + bne cr6, L(duLcr6) + sldi. rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + b L(dutrim) + +/* Remainder is 24. */ + .align 4 +L(duP3): + srd r12, rWORD8, rSHR + LD rWORD3, 0, rSTR1 + sld rWORD4_SHIFT, rWORD8, rSHL + or rWORD4, r12, rWORD6_SHIFT +L(duP3e): + LD rWORD5, rOFF8, rSTR1 + LD rWORD6, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT + LD rWORD7, rOFF16, rSTR1 + LD rWORD8, rOFF16, rSTR2 + cmpld cr6, rWORD5, rWORD6 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP3x) + LD rWORD1, rOFF24, rSTR1 + LD rWORD2, rOFF24, rSTR2 + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 + cmpld cr7, rWORD1, rWORD2 + b L(duLoop1) + .align 4 +L(duP3x): + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + sldi. rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + b L(dutrim) + +/* Count is a multiple of 32, remainder is 0. */ + .align 4 +L(duP4): + mtctr r0 + srd r0, rWORD8, rSHR + LD rWORD1, 0, rSTR1 + sld rWORD2_SHIFT, rWORD8, rSHL + or rWORD2, r0, rWORD6_SHIFT +L(duP4e): + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 + cmpld cr7, rWORD1, rWORD2 + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT + LD rWORD5, rOFF16, rSTR1 + LD rWORD6, rOFF16, rSTR2 + cmpld cr1, rWORD3, rWORD4 + bne cr7, L(duLcr7) + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT + LD rWORD7, rOFF24, rSTR1 + LD rWORD8, rOFF24, rSTR2 + addi rSTR1, rSTR1, 24 + addi rSTR2, rSTR2, 24 + cmpld cr6, rWORD5, rWORD6 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + cmpld cr5, rWORD7, rWORD8 + bdz L(du24) /* Adjust CTR as we start with +4. */ +/* This is the primary loop. */ + .align 4 +L(duLoop): + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT +L(duLoop1): + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(duLcr5) + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT +L(duLoop2): + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 + cmpld cr5, rWORD7, rWORD8 + bne cr7, L(duLcr7) + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT +L(duLoop3): + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 + cmpld cr7, rWORD1, rWORD2 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + bdnz L(duLoop) + +L(duL4): + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(duLcr6) + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(duLcr5) + cmpld cr5, rWORD7, rWORD8 +L(du44): + bne cr7, L(duLcr7) +L(du34): + bne cr1, L(duLcr1) +L(du24): + bne cr6, L(duLcr6) +L(du14): + sldi. rN, rN, 3 + bne cr5, L(duLcr5) +/* At this point we have a remainder of 1 to 7 bytes to compare. We use + shift right double to eliminate bits beyond the compare length. + + However it may not be safe to load rWORD2 which may be beyond the + string length. So we compare the bit length of the remainder to + the right shift count (rSHR). If the bit count is less than or equal + we do not need to load rWORD2 (all significant bits are already in + rWORD8_SHIFT). */ + cmpld cr7, rN, rSHR + beq L(duZeroReturn) + li r0, 0 + ble cr7, L(dutrim) + LD rWORD2, rOFF8, rSTR2 + srd r0, rWORD2, rSHR + .align 4 +L(dutrim): + LD rWORD1, rOFF8, rSTR1 + ld rWORD8, -8(r1) + subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */ + or rWORD2, r0, rWORD8_SHIFT + ld rWORD7, rWORD7SAVE(r1) + ld rSHL, rSHLSAVE(r1) + srd rWORD1, rWORD1, rN + srd rWORD2, rWORD2, rN + ld rSHR, rSHRSAVE(r1) + ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + li rRTN, 0 + cmpld cr7, rWORD1, rWORD2 + ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) + beq cr7, L(dureturn24) + li rRTN, 1 + ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + bgtlr cr7 + li rRTN, -1 + blr + .align 4 +L(duLcr7): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + li rRTN, 1 + bgt cr7, L(dureturn29) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) + li rRTN, -1 + b L(dureturn27) + .align 4 +L(duLcr1): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + li rRTN, 1 + bgt cr1, L(dureturn29) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) + li rRTN, -1 + b L(dureturn27) + .align 4 +L(duLcr6): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + li rRTN, 1 + bgt cr6, L(dureturn29) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) + li rRTN, -1 + b L(dureturn27) + .align 4 +L(duLcr5): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) + li rRTN, 1 + bgt cr5, L(dureturn29) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) + li rRTN, -1 + b L(dureturn27) + + .align 3 +L(duZeroReturn): + li rRTN, 0 + .align 4 +L(dureturn): + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) +L(dureturn29): + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) +L(dureturn27): + ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) +L(dureturn24): + ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + blr + +L(duzeroLength): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) + li rRTN, 0 + blr + +END (MEMCMP) +libc_hidden_builtin_def (memcmp) +weak_alias (memcmp, bcmp) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S new file mode 100644 index 0000000000..bc734c9f4f --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S @@ -0,0 +1,458 @@ +/* Optimized memset implementation for PowerPC64/POWER8. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */ + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); + Returns 's'. */ + +#ifndef MEMSET +# define MEMSET memset +#endif + + /* No need to use .machine power8 since mtvsrd is already + handled by the define. It avoid breakage on binutils + that does not support this machine specifier. */ + .machine power7 +EALIGN (MEMSET, 5, 0) + CALL_MCOUNT 3 + +L(_memset): + cmpldi cr7,r5,31 + neg r0,r3 + mr r10,r3 + + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 /* Replicate byte to word. */ + ble cr7,L(write_LT_32) + + andi. r11,r10,15 /* Check alignment of DST. */ + insrdi r4,r4,32,0 /* Replicate word to double word. */ + + beq L(big_aligned) + + mtocrf 0x01,r0 + clrldi r0,r0,60 + + /* Get DST aligned to 16 bytes. */ +1: bf 31,2f + stb r4,0(r10) + addi r10,r10,1 + +2: bf 30,4f + sth r4,0(r10) + addi r10,r10,2 + +4: bf 29,8f + stw r4,0(r10) + addi r10,r10,4 + +8: bf 28,16f + std r4,0(r10) + addi r10,r10,8 + +16: subf r5,r0,r5 + + .align 4 +L(big_aligned): + /* For sizes larger than 255 two possible paths: + - if constant is '0', zero full cache lines with dcbz + - otherwise uses vector instructions. */ + cmpldi cr5,r5,255 + dcbtst 0,r10 + cmpldi cr6,r4,0 + crand 27,26,21 + bt 27,L(huge_dcbz) + bge cr5,L(huge_vector) + + + /* Size between 32 and 255 bytes with constant different than 0, use + doubleword store instruction to achieve best throughput. */ + srdi r8,r5,5 + clrldi r11,r5,59 + cmpldi cr6,r11,0 + cmpdi r8,0 + beq L(tail_bytes) + mtctr r8 + + /* Main aligned write loop, writes 32-bytes at a time. */ + .align 4 +L(big_loop): + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + addi r10,r10,32 + bdz L(tail_bytes) + + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + addi r10,10,32 + bdnz L(big_loop) + + b L(tail_bytes) + + /* Write remaining 1~31 bytes. */ + .align 4 +L(tail_bytes): + beqlr cr6 + + srdi r7,r11,4 + clrldi r8,r11,60 + mtocrf 0x01,r7 + + .align 4 + bf 31,8f + std r4,0(r10) + std r4,8(r10) + addi r10,r10,16 + + .align 4 +8: mtocrf 0x1,r8 + bf 28,4f + std r4,0(r10) + addi r10,r10,8 + + .align 4 +4: bf 29,2f + stw 4,0(10) + addi 10,10,4 + + .align 4 +2: bf 30,1f + sth 4,0(10) + addi 10,10,2 + + .align 4 +1: bflr 31 + stb 4,0(10) + blr + + /* Size larger than 255 bytes with constant different than 0, use + vector instruction to achieve best throughput. */ +L(huge_vector): + /* Replicate set byte to quadword in VMX register. */ + MTVSRD_V1_R4 + xxpermdi 32,v0,v1,0 + vspltb v2,v0,15 + + /* Main aligned write loop: 128 bytes at a time. */ + li r6,16 + li r7,32 + li r8,48 + mtocrf 0x02,r5 + srdi r12,r5,7 + cmpdi r12,0 + beq L(aligned_tail) + mtctr r12 + b L(aligned_128loop) + + .align 4 +L(aligned_128loop): + stvx v2,0,r10 + stvx v2,r10,r6 + stvx v2,r10,r7 + stvx v2,r10,r8 + addi r10,r10,64 + stvx v2,0,r10 + stvx v2,r10,r6 + stvx v2,r10,r7 + stvx v2,r10,r8 + addi r10,r10,64 + bdnz L(aligned_128loop) + + /* Write remaining 1~127 bytes. */ +L(aligned_tail): + mtocrf 0x01,r5 + bf 25,32f + stvx v2,0,r10 + stvx v2,r10,r6 + stvx v2,r10,r7 + stvx v2,r10,r8 + addi r10,r10,64 + +32: bf 26,16f + stvx v2,0,r10 + stvx v2,r10,r6 + addi r10,r10,32 + +16: bf 27,8f + stvx v2,0,r10 + addi r10,r10,16 + +8: bf 28,4f + std r4,0(r10) + addi r10,r10,8 + + /* Copies 4~7 bytes. */ +4: bf 29,L(tail2) + stw r4,0(r10) + bf 30,L(tail5) + sth r4,4(r10) + bflr 31 + stb r4,6(r10) + /* Return original DST pointer. */ + blr + + /* Special case when value is 0 and we have a long length to deal + with. Use dcbz to zero out a full cacheline of 128 bytes at a time. + Before using dcbz though, we need to get the destination 128-byte + aligned. */ + .align 4 +L(huge_dcbz): + andi. r11,r10,127 + neg r0,r10 + beq L(huge_dcbz_aligned) + + clrldi r0,r0,57 + subf r5,r0,r5 + srdi r0,r0,3 + mtocrf 0x01,r0 + + /* Write 1~128 bytes until DST is aligned to 128 bytes. */ +8: bf 28,4f + + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + std r4,32(r10) + std r4,40(r10) + std r4,48(r10) + std r4,56(r10) + addi r10,r10,64 + + .align 4 +4: bf 29,2f + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + addi r10,r10,32 + + .align 4 +2: bf 30,1f + std r4,0(r10) + std r4,8(r10) + addi r10,r10,16 + + .align 4 +1: bf 31,L(huge_dcbz_aligned) + std r4,0(r10) + addi r10,r10,8 + +L(huge_dcbz_aligned): + /* Setup dcbz unroll offsets and count numbers. */ + srdi r8,r5,9 + clrldi r11,r5,55 + cmpldi cr6,r11,0 + li r9,128 + cmpdi r8,0 + beq L(huge_tail) + li r7,256 + li r6,384 + mtctr r8 + + .align 4 +L(huge_loop): + /* Sets 512 bytes to zero in each iteration, the loop unrolling shows + a throughput boost for large sizes (2048 bytes or higher). */ + dcbz 0,r10 + dcbz r9,r10 + dcbz r7,r10 + dcbz r6,r10 + addi r10,r10,512 + bdnz L(huge_loop) + + beqlr cr6 + +L(huge_tail): + srdi r6,r11,8 + srdi r7,r11,4 + clrldi r8,r11,4 + cmpldi cr6,r8,0 + mtocrf 0x01,r6 + + beq cr6,L(tail) + + /* We have 1~511 bytes remaining. */ + .align 4 +32: bf 31,16f + dcbz 0,r10 + dcbz r9,r10 + addi r10,r10,256 + + .align 4 +16: mtocrf 0x01,r7 + bf 28,8f + dcbz 0,r10 + addi r10,r10,128 + + .align 4 +8: bf 29,4f + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + std r4,32(r10) + std r4,40(r10) + std r4,48(r10) + std r4,56(r10) + addi r10,r10,64 + + .align 4 +4: bf 30,2f + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + addi r10,r10,32 + + .align 4 +2: bf 31,L(tail) + std r4,0(r10) + std r4,8(r10) + addi r10,r10,16 + .align 4 + + /* Remaining 1~15 bytes. */ +L(tail): + mtocrf 0x01,r8 + + .align +8: bf 28,4f + std r4,0(r10) + addi r10,r10,8 + + .align 4 +4: bf 29,2f + stw r4,0(r10) + addi r10,r10,4 + + .align 4 +2: bf 30,1f + sth r4,0(r10) + addi r10,r10,2 + + .align 4 +1: bflr 31 + stb r4,0(r10) + blr + + /* Handle short copies of 0~31 bytes. Best throughput is achieved + by just unrolling all operations. */ + .align 4 +L(write_LT_32): + cmpldi cr6,5,8 + mtocrf 0x01,r5 + ble cr6,L(write_LE_8) + + /* At least 9 bytes to go. */ + neg r8,r4 + andi. r0,r8,3 + cmpldi cr1,r5,16 + beq L(write_LT_32_aligned) + + /* Force 4-byte alignment for SRC. */ + mtocrf 0x01,r0 + subf r5,r0,r5 + +2: bf 30,1f + sth r4,0(r10) + addi r10,r10,2 + +1: bf 31,L(end_4bytes_alignment) + stb r4,0(r10) + addi r10,r10,1 + + .align 4 +L(end_4bytes_alignment): + cmpldi cr1,r5,16 + mtocrf 0x01,r5 + +L(write_LT_32_aligned): + blt cr1,8f + + stw r4,0(r10) + stw r4,4(r10) + stw r4,8(r10) + stw r4,12(r10) + addi r10,r10,16 + +8: bf 28,L(tail4) + stw r4,0(r10) + stw r4,4(r10) + addi r10,r10,8 + + .align 4 + /* Copies 4~7 bytes. */ +L(tail4): + bf 29,L(tail2) + stw r4,0(r10) + bf 30,L(tail5) + sth r4,4(r10) + bflr 31 + stb r4,6(r10) + blr + + .align 4 + /* Copies 2~3 bytes. */ +L(tail2): + bf 30,1f + sth r4,0(r10) + bflr 31 + stb r4,2(r10) + blr + + .align 4 +L(tail5): + bflr 31 + stb r4,4(r10) + blr + + .align 4 +1: bflr 31 + stb r4,0(r10) + blr + + /* Handles copies of 0~8 bytes. */ + .align 4 +L(write_LE_8): + bne cr6,L(tail4) + + stw r4,0(r10) + stw r4,4(r10) + blr +END_GEN_TB (MEMSET,TB_TOCLESS) +libc_hidden_builtin_def (memset) + +/* Copied from bzero.S to prevent the linker from inserting a stub + between bzero and memset. */ +ENTRY (__bzero) + CALL_MCOUNT 3 + mr r5,r4 + li r4,0 + b L(_memset) +END (__bzero) +#ifndef __bzero +weak_alias (__bzero, bzero) +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies new file mode 100644 index 0000000000..1fc7b7cd39 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/multiarch/Implies @@ -0,0 +1 @@ +powerpc/powerpc64/power7/multiarch diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S new file mode 100644 index 0000000000..955e738cee --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpcpy.S @@ -0,0 +1,24 @@ +/* Optimized stpcpy implementation for PowerPC64/POWER8. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define USE_AS_STPCPY +#include <sysdeps/powerpc/powerpc64/power8/strcpy.S> + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S new file mode 100644 index 0000000000..c14d984dd0 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/stpncpy.S @@ -0,0 +1,24 @@ +/* Optimized stpncpy implementation for PowerPC64/POWER8. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define USE_AS_STPNCPY +#include <sysdeps/powerpc/powerpc64/power8/strncpy.S> + +weak_alias (__stpncpy, stpncpy) +libc_hidden_def (__stpncpy) +libc_hidden_builtin_def (stpncpy) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S new file mode 100644 index 0000000000..88b17a6eb1 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasecmp.S @@ -0,0 +1,457 @@ +/* Optimized strcasecmp implementation for PowerPC64. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <locale-defines.h> + +/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */ + +#ifndef USE_AS_STRNCASECMP +# define __STRCASECMP __strcasecmp +# define STRCASECMP strcasecmp +#else +# define __STRCASECMP __strncasecmp +# define STRCASECMP strncasecmp +#endif +/* Convert 16 bytes to lowercase and compare */ +#define TOLOWER() \ + vaddubm v8, v4, v1; \ + vaddubm v7, v4, v3; \ + vcmpgtub v8, v8, v2; \ + vsel v4, v7, v4, v8; \ + vaddubm v8, v5, v1; \ + vaddubm v7, v5, v3; \ + vcmpgtub v8, v8, v2; \ + vsel v5, v7, v5, v8; \ + vcmpequb. v7, v5, v4; + +/* + * Get 16 bytes for unaligned case. + * reg1: Vector to hold next 16 bytes. + * reg2: Address to read from. + * reg3: Permute control vector. + * v8: Tmp vector used to mask unwanted bytes. + * v9: Tmp vector,0 when null is found on first 16 bytes + */ +#ifdef __LITTLE_ENDIAN__ +#define GET16BYTES(reg1, reg2, reg3) \ + lvx reg1, 0, reg2; \ + vspltisb v8, -1; \ + vperm v8, v8, reg1, reg3; \ + vcmpequb. v8, v0, v8; \ + beq cr6, 1f; \ + vspltisb v9, 0; \ + b 2f; \ + .align 4; \ +1: \ + addi r6, reg2, 16; \ + lvx v9, 0, r6; \ +2: \ + vperm reg1, v9, reg1, reg3; +#else +#define GET16BYTES(reg1, reg2, reg3) \ + lvx reg1, 0, reg2; \ + vspltisb v8, -1; \ + vperm v8, reg1, v8, reg3; \ + vcmpequb. v8, v0, v8; \ + beq cr6, 1f; \ + vspltisb v9, 0; \ + b 2f; \ + .align 4; \ +1: \ + addi r6, reg2, 16; \ + lvx v9, 0, r6; \ +2: \ + vperm reg1, reg1, v9, reg3; +#endif + +/* Check null in v4, v5 and convert to lower. */ +#define CHECKNULLANDCONVERT() \ + vcmpequb. v7, v0, v5; \ + beq cr6, 3f; \ + vcmpequb. v7, v0, v4; \ + beq cr6, 3f; \ + b L(null_found); \ + .align 4; \ +3: \ + TOLOWER() + +#ifdef _ARCH_PWR8 +# define VCLZD_V8_v7 vclzd v8, v7; +# define MFVRD_R3_V1 mfvrd r3, v1; +# define VSUBUDM_V9_V8 vsubudm v9, v9, v8; +# define VPOPCNTD_V8_V8 vpopcntd v8, v8; +# define VADDUQM_V7_V8 vadduqm v9, v7, v8; +#else +# define VCLZD_V8_v7 .long 0x11003fc2 +# define MFVRD_R3_V1 .long 0x7c230067 +# define VSUBUDM_V9_V8 .long 0x112944c0 +# define VPOPCNTD_V8_V8 .long 0x110047c3 +# define VADDUQM_V7_V8 .long 0x11274100 +#endif + + .machine power7 + +ENTRY (__STRCASECMP) +#ifdef USE_AS_STRNCASECMP + CALL_MCOUNT 3 +#else + CALL_MCOUNT 2 +#endif +#define rRTN r3 /* Return value */ +#define rSTR1 r10 /* 1st string */ +#define rSTR2 r4 /* 2nd string */ +#define rCHAR1 r6 /* Byte read from 1st string */ +#define rCHAR2 r7 /* Byte read from 2nd string */ +#define rADDR1 r8 /* Address of tolower(rCHAR1) */ +#define rADDR2 r12 /* Address of tolower(rCHAR2) */ +#define rLWR1 r8 /* Word tolower(rCHAR1) */ +#define rLWR2 r12 /* Word tolower(rCHAR2) */ +#define rTMP r9 +#define rLOC r11 /* Default locale address */ + + cmpd cr7, rRTN, rSTR2 + + /* Get locale address. */ + ld rTMP, __libc_tsd_LOCALE@got@tprel(r2) + add rLOC, rTMP, __libc_tsd_LOCALE@tls + ld rLOC, 0(rLOC) + + mr rSTR1, rRTN + li rRTN, 0 + beqlr cr7 +#ifdef USE_AS_STRNCASECMP + cmpdi cr7, r5, 0 + beq cr7, L(retnull) + cmpdi cr7, r5, 16 + blt cr7, L(bytebybyte) +#endif + vspltisb v0, 0 + vspltisb v8, -1 + /* Check for null in initial characters. + Check max of 16 char depending on the alignment. + If null is present, proceed byte by byte. */ + lvx v4, 0, rSTR1 +#ifdef __LITTLE_ENDIAN__ + lvsr v10, 0, rSTR1 /* Compute mask. */ + vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */ +#else + lvsl v10, 0, rSTR1 + vperm v9, v4, v8, v10 +#endif + vcmpequb. v9, v0, v9 /* Check for null bytes. */ + bne cr6, L(bytebybyte) + lvx v5, 0, rSTR2 + /* Calculate alignment. */ +#ifdef __LITTLE_ENDIAN__ + lvsr v6, 0, rSTR2 + vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */ +#else + lvsl v6, 0, rSTR2 + vperm v9, v5, v8, v6 +#endif + vcmpequb. v9, v0, v9 /* Check for null bytes. */ + bne cr6, L(bytebybyte) + /* Check if locale has non ascii characters. */ + ld rTMP, 0(rLOC) + addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES + lwz rTMP, 0(r6) + cmpdi cr7, rTMP, 1 + beq cr7, L(bytebybyte) + + /* Load vector registers with values used for TOLOWER. */ + /* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte. */ + vspltisb v3, 2 + vspltisb v9, 4 + vsl v3, v3, v9 + vaddubm v1, v3, v3 + vnor v1, v1, v1 + vspltisb v2, 7 + vsububm v2, v3, v2 + + andi. rADDR1, rSTR1, 0xF + beq cr0, L(align) + addi r6, rSTR1, 16 + lvx v9, 0, r6 + /* Compute 16 bytes from previous two loads. */ +#ifdef __LITTLE_ENDIAN__ + vperm v4, v9, v4, v10 +#else + vperm v4, v4, v9, v10 +#endif +L(align): + andi. rADDR2, rSTR2, 0xF + beq cr0, L(align1) + addi r6, rSTR2, 16 + lvx v9, 0, r6 + /* Compute 16 bytes from previous two loads. */ +#ifdef __LITTLE_ENDIAN__ + vperm v5, v9, v5, v6 +#else + vperm v5, v5, v9, v6 +#endif +L(align1): + CHECKNULLANDCONVERT() + blt cr6, L(match) + b L(different) + .align 4 +L(match): + clrldi r6, rSTR1, 60 + subfic r7, r6, 16 +#ifdef USE_AS_STRNCASECMP + sub r5, r5, r7 +#endif + add rSTR1, rSTR1, r7 + add rSTR2, rSTR2, r7 + andi. rADDR2, rSTR2, 0xF + addi rSTR1, rSTR1, -16 + addi rSTR2, rSTR2, -16 + beq cr0, L(aligned) +#ifdef __LITTLE_ENDIAN__ + lvsr v6, 0, rSTR2 +#else + lvsl v6, 0, rSTR2 +#endif + /* There are 2 loops depending on the input alignment. + Each loop gets 16 bytes from s1 and s2, check for null, + convert to lowercase and compare. Loop till difference + or null occurs. */ +L(s1_align): + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 +#ifdef USE_AS_STRNCASECMP + cmpdi cr7, r5, 16 + blt cr7, L(bytebybyte) + addi r5, r5, -16 +#endif + lvx v4, 0, rSTR1 + GET16BYTES(v5, rSTR2, v6) + CHECKNULLANDCONVERT() + blt cr6, L(s1_align) + b L(different) + .align 4 +L(aligned): + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 +#ifdef USE_AS_STRNCASECMP + cmpdi cr7, r5, 16 + blt cr7, L(bytebybyte) + addi r5, r5, -16 +#endif + lvx v4, 0, rSTR1 + lvx v5, 0, rSTR2 + CHECKNULLANDCONVERT() + blt cr6, L(aligned) + + /* Calculate and return the difference. */ +L(different): + vaddubm v1, v3, v3 + vcmpequb v7, v0, v7 +#ifdef __LITTLE_ENDIAN__ + /* Count trailing zero. */ + vspltisb v8, -1 + VADDUQM_V7_V8 + vandc v8, v9, v7 + VPOPCNTD_V8_V8 + vspltb v6, v8, 15 + vcmpequb. v6, v6, v1 + blt cr6, L(shift8) +#else + /* Count leading zero. */ + VCLZD_V8_v7 + vspltb v6, v8, 7 + vcmpequb. v6, v6, v1 + blt cr6, L(shift8) + vsro v8, v8, v1 +#endif + b L(skipsum) + .align 4 +L(shift8): + vsumsws v8, v8, v0 +L(skipsum): +#ifdef __LITTLE_ENDIAN__ + /* Shift registers based on leading zero count. */ + vsro v6, v5, v8 + vsro v7, v4, v8 + /* Merge and move to GPR. */ + vmrglb v6, v6, v7 + vslo v1, v6, v1 + MFVRD_R3_V1 + /* Place the characters that are different in first position. */ + sldi rSTR2, rRTN, 56 + srdi rSTR2, rSTR2, 56 + sldi rSTR1, rRTN, 48 + srdi rSTR1, rSTR1, 56 +#else + vslo v6, v5, v8 + vslo v7, v4, v8 + vmrghb v1, v6, v7 + MFVRD_R3_V1 + srdi rSTR2, rRTN, 48 + sldi rSTR2, rSTR2, 56 + srdi rSTR2, rSTR2, 56 + srdi rSTR1, rRTN, 56 +#endif + subf rRTN, rSTR1, rSTR2 + extsw rRTN, rRTN + blr + + .align 4 + /* OK. We've hit the end of the string. We need to be careful that + we don't compare two strings as different because of junk beyond + the end of the strings... */ +L(null_found): + vaddubm v10, v3, v3 +#ifdef __LITTLE_ENDIAN__ + /* Count trailing zero. */ + vspltisb v8, -1 + VADDUQM_V7_V8 + vandc v8, v9, v7 + VPOPCNTD_V8_V8 + vspltb v6, v8, 15 + vcmpequb. v6, v6, v10 + blt cr6, L(shift_8) +#else + /* Count leading zero. */ + VCLZD_V8_v7 + vspltb v6, v8, 7 + vcmpequb. v6, v6, v10 + blt cr6, L(shift_8) + vsro v8, v8, v10 +#endif + b L(skipsum1) + .align 4 +L(shift_8): + vsumsws v8, v8, v0 +L(skipsum1): + /* Calculate shift count based on count of zero. */ + vspltisb v10, 7 + vslb v10, v10, v10 + vsldoi v9, v0, v10, 1 + VSUBUDM_V9_V8 + vspltisb v8, 8 + vsldoi v8, v0, v8, 1 + VSUBUDM_V9_V8 + /* Shift and remove junk after null character. */ +#ifdef __LITTLE_ENDIAN__ + vslo v5, v5, v9 + vslo v4, v4, v9 +#else + vsro v5, v5, v9 + vsro v4, v4, v9 +#endif + /* Convert and compare 16 bytes. */ + TOLOWER() + blt cr6, L(retnull) + b L(different) + .align 4 +L(retnull): + li rRTN, 0 + blr + .align 4 +L(bytebybyte): + /* Unrolling loop for POWER: loads are done with 'lbz' plus + offset and string descriptors are only updated in the end + of loop unrolling. */ + ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC) + lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ + lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ +#ifdef USE_AS_STRNCASECMP + rldicl rTMP, r5, 62, 2 + cmpdi cr7, rTMP, 0 + beq cr7, L(lessthan4) + mtctr rTMP +#endif +L(loop): + cmpdi rCHAR1, 0 /* *s1 == '\0' ? */ + sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */ + sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */ + lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */ + lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */ + cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */ + crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */ + beq cr1, L(done) + lbz rCHAR1, 1(rSTR1) + lbz rCHAR2, 1(rSTR2) + cmpdi rCHAR1, 0 + sldi rADDR1, rCHAR1, 2 + sldi rADDR2, rCHAR2, 2 + lwzx rLWR1, rLOC, rADDR1 + lwzx rLWR2, rLOC, rADDR2 + cmpw cr1, rLWR1, rLWR2 + crorc 4*cr1+eq,eq,4*cr1+eq + beq cr1, L(done) + lbz rCHAR1, 2(rSTR1) + lbz rCHAR2, 2(rSTR2) + cmpdi rCHAR1, 0 + sldi rADDR1, rCHAR1, 2 + sldi rADDR2, rCHAR2, 2 + lwzx rLWR1, rLOC, rADDR1 + lwzx rLWR2, rLOC, rADDR2 + cmpw cr1, rLWR1, rLWR2 + crorc 4*cr1+eq,eq,4*cr1+eq + beq cr1, L(done) + lbz rCHAR1, 3(rSTR1) + lbz rCHAR2, 3(rSTR2) + cmpdi rCHAR1, 0 + /* Increment both string descriptors */ + addi rSTR1, rSTR1, 4 + addi rSTR2, rSTR2, 4 + sldi rADDR1, rCHAR1, 2 + sldi rADDR2, rCHAR2, 2 + lwzx rLWR1, rLOC, rADDR1 + lwzx rLWR2, rLOC, rADDR2 + cmpw cr1, rLWR1, rLWR2 + crorc 4*cr1+eq,eq,4*cr1+eq + beq cr1, L(done) + lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ + lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ +#ifdef USE_AS_STRNCASECMP + bdnz L(loop) +#else + b L(loop) +#endif +#ifdef USE_AS_STRNCASECMP +L(lessthan4): + clrldi r5, r5, 62 + cmpdi cr7, r5, 0 + beq cr7, L(retnull) + mtctr r5 +L(loop1): + cmpdi rCHAR1, 0 + sldi rADDR1, rCHAR1, 2 + sldi rADDR2, rCHAR2, 2 + lwzx rLWR1, rLOC, rADDR1 + lwzx rLWR2, rLOC, rADDR2 + cmpw cr1, rLWR1, rLWR2 + crorc 4*cr1+eq,eq,4*cr1+eq + beq cr1, L(done) + addi rSTR1, rSTR1, 1 + addi rSTR2, rSTR2, 1 + lbz rCHAR1, 0(rSTR1) + lbz rCHAR2, 0(rSTR2) + bdnz L(loop1) +#endif +L(done): + subf r0, rLWR2, rLWR1 + extsw rRTN, r0 + blr +END (__STRCASECMP) + +weak_alias (__STRCASECMP, STRCASECMP) +libc_hidden_builtin_def (__STRCASECMP) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c new file mode 100644 index 0000000000..0e746b7718 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c @@ -0,0 +1,29 @@ +/* Optimized strcasestr implementation for PowerPC64/POWER8. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <string.h> + +#define STRCASESTR __strcasestr_ppc +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(__name) + +#undef weak_alias +#define weak_alias(a,b) +extern __typeof (strcasestr) __strcasestr_ppc attribute_hidden; + +#include <string/strcasestr.c> diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S new file mode 100644 index 0000000000..6ac6572f3b --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcasestr.S @@ -0,0 +1,538 @@ +/* Optimized strcasestr implementation for PowerPC64/POWER8. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <locale-defines.h> + +/* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */ + +/* The performance gain is obtained by comparing 16 bytes. */ + +/* When the first char of r4 is hit ITERATIONS times in r3 + fallback to default. */ +#define ITERATIONS 64 + +#ifndef STRCASESTR +# define STRCASESTR __strcasestr +#endif + +#ifndef STRLEN +/* For builds without IFUNC support, local calls should be made to internal + GLIBC symbol (created by libc_hidden_builtin_def). */ +# ifdef SHARED +# define STRLEN __GI_strlen +# else +# define STRLEN strlen +# endif +#endif + +#ifndef STRNLEN +/* For builds without IFUNC support, local calls should be made to internal + GLIBC symbol (created by libc_hidden_builtin_def). */ +# ifdef SHARED +# define STRNLEN __GI_strnlen +# else +# define STRNLEN __strnlen +# endif +#endif + +#ifndef STRCHR +# ifdef SHARED +# define STRCHR __GI_strchr +# else +# define STRCHR strchr +# endif +#endif + +/* Convert 16 bytes of v4 and reg to lowercase and compare. */ +#define TOLOWER(reg) \ + vcmpgtub v6, v4, v1; \ + vcmpgtub v7, v2, v4; \ + vand v8, v7, v6; \ + vand v8, v8, v3; \ + vor v4, v8, v4; \ + vcmpgtub v6, reg, v1; \ + vcmpgtub v7, v2, reg; \ + vand v8, v7, v6; \ + vand v8, v8, v3; \ + vor reg, v8, reg; \ + vcmpequb. v6, reg, v4; + +/* TODO: change these to the actual instructions when the minimum required + binutils allows it. */ +#ifdef _ARCH_PWR8 +#define VCLZD_V8_v7 vclzd v8, v7; +#else +#define VCLZD_V8_v7 .long 0x11003fc2 +#endif + +#define FRAMESIZE (FRAME_MIN_SIZE+48) +/* TODO: change this to .machine power8 when the minimum required binutils + allows it. */ + .machine power7 +EALIGN (STRCASESTR, 4, 0) + CALL_MCOUNT 2 + mflr r0 /* Load link register LR to r0. */ + std r31, -8(r1) /* Save callers register r31. */ + std r30, -16(r1) /* Save callers register r30. */ + std r29, -24(r1) /* Save callers register r29. */ + std r28, -32(r1) /* Save callers register r28. */ + std r27, -40(r1) /* Save callers register r27. */ + std r0, 16(r1) /* Store the link register. */ + cfi_offset(r31, -8) + cfi_offset(r30, -16) + cfi_offset(r29, -24) + cfi_offset(r28, -32) + cfi_offset(r27, -40) + cfi_offset(lr, 16) + stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ + cfi_adjust_cfa_offset(FRAMESIZE) + + dcbt 0, r3 + dcbt 0, r4 + cmpdi cr7, r3, 0 /* Input validation. */ + beq cr7, L(retnull) + cmpdi cr7, r4, 0 + beq cr7, L(retnull) + + mr r29, r3 + mr r30, r4 + /* Load first byte from r4 and check if its null. */ + lbz r6, 0(r4) + cmpdi cr7, r6, 0 + beq cr7, L(ret_r3) + + ld r10, __libc_tsd_LOCALE@got@tprel(r2) + add r9, r10, __libc_tsd_LOCALE@tls + ld r9, 0(r9) + ld r9, LOCALE_CTYPE_TOUPPER(r9) + sldi r10, r6, 2 /* Convert to upper case. */ + lwzx r28, r9, r10 + + ld r10, __libc_tsd_LOCALE@got@tprel(r2) + add r11, r10, __libc_tsd_LOCALE@tls + ld r11, 0(r11) + ld r11, LOCALE_CTYPE_TOLOWER(r11) + sldi r10, r6, 2 /* Convert to lower case. */ + lwzx r27, r11, r10 + + /* Check if the first char is present. */ + mr r4, r27 + bl STRCHR + nop + mr r5, r3 + mr r3, r29 + mr r29, r5 + mr r4, r28 + bl STRCHR + nop + cmpdi cr7, r29, 0 + beq cr7, L(firstpos) + cmpdi cr7, r3, 0 + beq cr7, L(skipcheck) + cmpw cr7, r3, r29 + ble cr7, L(firstpos) + /* Move r3 to the first occurence. */ +L(skipcheck): + mr r3, r29 +L(firstpos): + mr r29, r3 + + sldi r9, r27, 8 + or r28, r9, r28 + /* Reg r27 is used to count the number of iterations. */ + li r27, 0 + /* If first char of search str is not present. */ + cmpdi cr7, r3, 0 + ble cr7, L(end) + + /* Find the length of pattern. */ + mr r3, r30 + bl STRLEN + nop + + cmpdi cr7, r3, 0 /* If search str is null. */ + beq cr7, L(ret_r3) + + mr r31, r3 + mr r4, r3 + mr r3, r29 + bl STRNLEN + nop + + cmpd cr7, r3, r31 /* If len(r3) < len(r4). */ + blt cr7, L(retnull) + + mr r3, r29 + + /* Locales not matching ASCII for single bytes. */ + ld r10, __libc_tsd_LOCALE@got@tprel(r2) + add r9, r10, __libc_tsd_LOCALE@tls + ld r9, 0(r9) + ld r7, 0(r9) + addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES + lwz r8, 0(r7) + cmpdi cr7, r8, 1 + beq cr7, L(bytebybyte) + + /* If len(r4) < 16 handle byte by byte. */ + /* For shorter strings we will not use vector registers. */ + cmpdi cr7, r31, 16 + blt cr7, L(bytebybyte) + + /* Comparison values used for TOLOWER. */ + /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */ + vspltish v0, 0 + vspltisb v5, 2 + vspltisb v4, 4 + vsl v3, v5, v4 + vaddubm v1, v3, v3 + vspltisb v5, 15 + vaddubm v2, v5, v5 + vaddubm v2, v1, v2 + vspltisb v4, -3 + vaddubm v2, v2, v4 + + /* + 1. Load 16 bytes from r3 and r4 + 2. Check if there is null, If yes, proceed byte by byte path. + 3. Else,Convert both to lowercase and compare. + 4. If they are same proceed to 1. + 5. If they dont match, find if first char of r4 is present in the + loaded 16 byte of r3. + 6. If yes, move position, load next 16 bytes of r3 and proceed to 2. + */ + + mr r8, r3 /* Save r3 for future use. */ + mr r4, r30 /* Restore r4. */ + clrldi r10, r4, 60 + lvx v5, 0, r4 /* Load 16 bytes from r4. */ + cmpdi cr7, r10, 0 + beq cr7, L(begin2) + /* If r4 is unaligned, load another 16 bytes. */ +#ifdef __LITTLE_ENDIAN__ + lvsr v7, 0, r4 +#else + lvsl v7, 0, r4 +#endif + addi r5, r4, 16 + lvx v9, 0, r5 +#ifdef __LITTLE_ENDIAN__ + vperm v5, v9, v5, v7 +#else + vperm v5, v5, v9, v7 +#endif +L(begin2): + lvx v4, 0, r3 + vcmpequb. v7, v0, v4 /* Check for null. */ + beq cr6, L(nullchk6) + b L(trailcheck) + + .align 4 +L(nullchk6): + clrldi r10, r3, 60 + cmpdi cr7, r10, 0 + beq cr7, L(next16) +#ifdef __LITTLE_ENDIAN__ + lvsr v7, 0, r3 +#else + lvsl v7, 0, r3 +#endif + addi r5, r3, 16 + /* If r3 is unaligned, load another 16 bytes. */ + lvx v10, 0, r5 +#ifdef __LITTLE_ENDIAN__ + vperm v4, v10, v4, v7 +#else + vperm v4, v4, v10, v7 +#endif +L(next16): + vcmpequb. v6, v0, v5 /* Check for null. */ + beq cr6, L(nullchk) + b L(trailcheck) + + .align 4 +L(nullchk): + vcmpequb. v6, v0, v4 + beq cr6, L(nullchk1) + b L(retnull) + + .align 4 +L(nullchk1): + /* Convert both v3 and v4 to lower. */ + TOLOWER(v5) + /* If both are same, branch to match. */ + blt cr6, L(match) + /* Find if the first char is present in next 15 bytes. */ +#ifdef __LITTLE_ENDIAN__ + vspltb v6, v5, 15 + vsldoi v7, v0, v4, 15 +#else + vspltb v6, v5, 0 + vspltisb v7, 8 + vslo v7, v4, v7 +#endif + vcmpequb v7, v6, v7 + vcmpequb. v6, v0, v7 + /* Shift r3 by 16 bytes and proceed. */ + blt cr6, L(shift16) + VCLZD_V8_v7 +#ifdef __LITTLE_ENDIAN__ + vspltb v6, v8, 15 +#else + vspltb v6, v8, 7 +#endif + vcmpequb. v6, v6, v1 + /* Shift r3 by 8 bytes and proceed. */ + blt cr6, L(shift8) + b L(begin) + + .align 4 +L(match): + /* There is a match of 16 bytes, check next bytes. */ + cmpdi cr7, r31, 16 + mr r29, r3 + beq cr7, L(ret_r3) + +L(secondmatch): + addi r3, r3, 16 + addi r4, r4, 16 + /* Load next 16 bytes of r3 and r4 and compare. */ + clrldi r10, r4, 60 + cmpdi cr7, r10, 0 + beq cr7, L(nextload) + /* Handle unaligned case. */ + vor v6, v9, v9 + vcmpequb. v7, v0, v6 + beq cr6, L(nullchk2) + b L(trailcheck) + + .align 4 +L(nullchk2): +#ifdef __LITTLE_ENDIAN__ + lvsr v7, 0, r4 +#else + lvsl v7, 0, r4 +#endif + addi r5, r4, 16 + /* If r4 is unaligned, load another 16 bytes. */ + lvx v9, 0, r5 +#ifdef __LITTLE_ENDIAN__ + vperm v11, v9, v6, v7 +#else + vperm v11, v6, v9, v7 +#endif + b L(compare) + + .align 4 +L(nextload): + lvx v11, 0, r4 +L(compare): + vcmpequb. v7, v0, v11 + beq cr6, L(nullchk3) + b L(trailcheck) + + .align 4 +L(nullchk3): + clrldi r10, r3, 60 + cmpdi cr7, r10, 0 + beq cr7, L(nextload1) + /* Handle unaligned case. */ + vor v4, v10, v10 + vcmpequb. v7, v0, v4 + beq cr6, L(nullchk4) + b L(retnull) + + .align 4 +L(nullchk4): +#ifdef __LITTLE_ENDIAN__ + lvsr v7, 0, r3 +#else + lvsl v7, 0, r3 +#endif + addi r5, r3, 16 + /* If r3 is unaligned, load another 16 bytes. */ + lvx v10, 0, r5 +#ifdef __LITTLE_ENDIAN__ + vperm v4, v10, v4, v7 +#else + vperm v4, v4, v10, v7 +#endif + b L(compare1) + + .align 4 +L(nextload1): + lvx v4, 0, r3 +L(compare1): + vcmpequb. v7, v0, v4 + beq cr6, L(nullchk5) + b L(retnull) + + .align 4 +L(nullchk5): + /* Convert both v3 and v4 to lower. */ + TOLOWER(v11) + /* If both are same, branch to secondmatch. */ + blt cr6, L(secondmatch) + /* Continue the search. */ + b L(begin) + + .align 4 +L(trailcheck): + ld r10, __libc_tsd_LOCALE@got@tprel(r2) + add r11, r10, __libc_tsd_LOCALE@tls + ld r11, 0(r11) + ld r11, LOCALE_CTYPE_TOLOWER(r11) +L(loop2): + lbz r5, 0(r3) /* Load byte from r3. */ + lbz r6, 0(r4) /* Load next byte from r4. */ + cmpdi cr7, r6, 0 /* Is it null? */ + beq cr7, L(updater3) + cmpdi cr7, r5, 0 /* Is it null? */ + beq cr7, L(retnull) /* If yes, return. */ + addi r3, r3, 1 + addi r4, r4, 1 /* Increment r4. */ + sldi r10, r5, 2 /* Convert to lower case. */ + lwzx r10, r11, r10 + sldi r7, r6, 2 /* Convert to lower case. */ + lwzx r7, r11, r7 + cmpw cr7, r7, r10 /* Compare with byte from r4. */ + bne cr7, L(begin) + b L(loop2) + + .align 4 +L(shift8): + addi r8, r8, 7 + b L(begin) + .align 4 +L(shift16): + addi r8, r8, 15 + .align 4 +L(begin): + addi r8, r8, 1 + mr r3, r8 + /* When our iterations exceed ITERATIONS,fall back to default. */ + addi r27, r27, 1 + cmpdi cr7, r27, ITERATIONS + beq cr7, L(default) + mr r4, r30 /* Restore r4. */ + b L(begin2) + + /* Handling byte by byte. */ + .align 4 +L(loop1): + mr r3, r8 + addi r27, r27, 1 + cmpdi cr7, r27, ITERATIONS + beq cr7, L(default) + mr r29, r8 + srdi r4, r28, 8 + /* Check if the first char is present. */ + bl STRCHR + nop + mr r5, r3 + mr r3, r29 + mr r29, r5 + sldi r4, r28, 56 + srdi r4, r4, 56 + bl STRCHR + nop + cmpdi cr7, r29, 0 + beq cr7, L(nextpos) + cmpdi cr7, r3, 0 + beq cr7, L(skipcheck1) + cmpw cr7, r3, r29 + ble cr7, L(nextpos) + /* Move r3 to first occurence. */ +L(skipcheck1): + mr r3, r29 +L(nextpos): + mr r29, r3 + cmpdi cr7, r3, 0 + ble cr7, L(retnull) +L(bytebybyte): + ld r10, __libc_tsd_LOCALE@got@tprel(r2) + add r11, r10, __libc_tsd_LOCALE@tls + ld r11, 0(r11) + ld r11, LOCALE_CTYPE_TOLOWER(r11) + mr r4, r30 /* Restore r4. */ + mr r8, r3 /* Save r3. */ + addi r8, r8, 1 + +L(loop): + addi r3, r3, 1 + lbz r5, 0(r3) /* Load byte from r3. */ + addi r4, r4, 1 /* Increment r4. */ + lbz r6, 0(r4) /* Load next byte from r4. */ + cmpdi cr7, r6, 0 /* Is it null? */ + beq cr7, L(updater3) + cmpdi cr7, r5, 0 /* Is it null? */ + beq cr7, L(retnull) /* If yes, return. */ + sldi r10, r5, 2 /* Convert to lower case. */ + lwzx r10, r11, r10 + sldi r7, r6, 2 /* Convert to lower case. */ + lwzx r7, r11, r7 + cmpw cr7, r7, r10 /* Compare with byte from r4. */ + bne cr7, L(loop1) + b L(loop) + + /* Handling return values. */ + .align 4 +L(updater3): + subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */ + b L(end) + + .align 4 +L(ret_r3): + mr r3, r29 /* Return point of match. */ + b L(end) + + .align 4 +L(retnull): + li r3, 0 /* Substring was not found. */ + b L(end) + + .align 4 +L(default): + mr r4, r30 + bl __strcasestr_ppc + nop + + .align 4 +L(end): + addi r1, r1, FRAMESIZE /* Restore stack pointer. */ + cfi_adjust_cfa_offset(-FRAMESIZE) + ld r0, 16(r1) /* Restore the saved link register. */ + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) /* Restore callers save register r29. */ + ld r30, -16(r1) /* Restore callers save register r30. */ + ld r31, -8(r1) /* Restore callers save register r31. */ + cfi_restore(lr) + cfi_restore(r27) + cfi_restore(r28) + cfi_restore(r29) + cfi_restore(r30) + cfi_restore(r31) + mtlr r0 /* Branch to link register. */ + blr +END (STRCASESTR) + +weak_alias (__strcasestr, strcasestr) +libc_hidden_def (__strcasestr) +libc_hidden_builtin_def (strcasestr) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S new file mode 100644 index 0000000000..e0c185c162 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchr.S @@ -0,0 +1,377 @@ +/* Optimized strchr implementation for PowerPC64/POWER8. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifdef USE_AS_STRCHRNUL +# ifndef STRCHRNUL +# define FUNC_NAME __strchrnul +# else +# define FUNC_NAME STRCHRNUL +# endif +#else +# ifndef STRCHR +# define FUNC_NAME strchr +# else +# define FUNC_NAME STRCHR +# endif +#endif /* !USE_AS_STRCHRNUL */ + +/* int [r3] strchr (char *s [r3], int c [r4]) */ +/* TODO: change these to the actual instructions when the minimum required + binutils allows it. */ +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) +/* TODO: change this to .machine power8 when the minimum required binutils + allows it. */ + .machine power7 +ENTRY (FUNC_NAME) + CALL_MCOUNT 2 + dcbt 0,r3 + clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ + cmpdi cr7,r4,0 + ld r12,0(r8) /* Load doubleword from memory. */ + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + + beq cr7,L(null_match) + + /* Replicate byte to doubleword. */ + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 + insrdi r4,r4,32,0 + + /* Now r4 has a doubleword of c bytes and r0 has + a doubleword of null bytes. */ + + cmpb r10,r12,r4 /* Compare each byte against c byte. */ + cmpb r11,r12,r0 /* Compare each byte against null byte. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + srd r11,r11,r6 + sld r10,r10,r6 + sld r11,r11,r6 +#else + sld r10,r10,r6 + sld r11,r11,r6 + srd r10,r10,r6 + srd r11,r11,r6 +#endif + or r5,r10,r11 /* OR the results to speed things up. */ + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes + have been found. */ + bne cr7,L(done) + + mtcrf 0x01,r8 + + /* Are we now aligned to a doubleword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + bne cr7,L(done) + b L(loop) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + .p2align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r9,16(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + cmpb r6,r9,r4 + cmpb r7,r9,r0 + or r5,r10,r11 + or r9,r6,r7 + or r12,r5,r9 + cmpdi cr7,r12,0 + beq cr7,L(vector) + /* OK, one (or both) of the doublewords contains a c/null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a c/null byte. */ + + cmpdi cr6,r5,0 + addi r8,r8,-8 + bne cr6,L(done) + + /* The c/null byte must be in the second doubleword. Adjust the + address again and move the result of cmpb to r10 so we can calculate + the pointer. */ + + mr r10,r6 + mr r11,r7 + addi r8,r8,8 +#ifdef USE_AS_STRCHRNUL + mr r5, r9 +#endif + /* r10/r11 have the output of the cmpb instructions, that is, + 0xff in the same position as the c/null byte in the original + doubleword from the string. Use that to calculate the pointer. */ +L(done): +#ifdef USE_AS_STRCHRNUL + mr r10, r5 +#endif +#ifdef __LITTLE_ENDIAN__ + addi r3,r10,-1 + andc r3,r3,r10 + popcntd r0,r3 +# ifndef USE_AS_STRCHRNUL + addi r4,r11,-1 + andc r4,r4,r11 + cmpld cr7,r3,r4 + bgt cr7,L(no_match) +# endif +#else + cntlzd r0,r10 /* Count leading zeros before c matches. */ +# ifndef USE_AS_STRCHRNUL + cmpld cr7,r11,r10 + bgt cr7,L(no_match) +# endif +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching c byte + or null in case c was not found. */ + blr + + /* Check the first 32B in GPR's and move to vectorized loop. */ + .p2align 5 +L(vector): + addi r3, r8, 8 + andi. r10, r3, 31 + bne cr0, L(loop) + vspltisb v0, 0 + /* Precompute vbpermq constant. */ + vspltisb v10, 3 + lvsl v11, r0, r0 + vslb v10, v11, v10 + MTVRD(v1,r4) + li r5, 16 + vspltb v1, v1, 7 + /* Compare 32 bytes in each loop. */ +L(continue): + lvx v4, 0, r3 + lvx v5, r3, r5 + vcmpequb v2, v0, v4 + vcmpequb v3, v0, v5 + vcmpequb v6, v1, v4 + vcmpequb v7, v1, v5 + vor v8, v2, v3 + vor v9, v6, v7 + vor v11, v8, v9 + vcmpequb. v11, v0, v11 + addi r3, r3, 32 + blt cr6, L(continue) + /* One (or both) of the quadwords contains a c/null byte. */ + addi r3, r3, -32 +#ifndef USE_AS_STRCHRNUL + vcmpequb. v11, v0, v9 + blt cr6, L(no_match) +#endif + /* Permute the first bit of each byte into bits 48-63. */ + VBPERMQ(v2, v2, v10) + VBPERMQ(v3, v3, v10) + VBPERMQ(v6, v6, v10) + VBPERMQ(v7, v7, v10) + /* Shift each component into its correct position for merging. */ +#ifdef __LITTLE_ENDIAN__ + vsldoi v3, v3, v3, 2 + vsldoi v7, v7, v7, 2 +#else + vsldoi v2, v2, v2, 6 + vsldoi v3, v3, v3, 4 + vsldoi v6, v6, v6, 6 + vsldoi v7, v7, v7, 4 +#endif + + /* Merge the results and move to a GPR. */ + vor v1, v3, v2 + vor v2, v6, v7 + vor v4, v1, v2 + MFVRD(r5, v4) +#ifdef __LITTLE_ENDIAN__ + addi r6, r5, -1 + andc r6, r6, r5 + popcntd r6, r6 +#else + cntlzd r6, r5 /* Count leading zeros before the match. */ +#endif + add r3, r3, r6 /* Compute final length. */ + /* Return NULL if null found before c. */ +#ifndef USE_AS_STRCHRNUL + lbz r4, 0(r3) + cmpdi cr7, r4, 0 + beq cr7, L(no_match) +#endif + blr + +#ifndef USE_AS_STRCHRNUL + .align 4 +L(no_match): + li r3,0 + blr +#endif + +/* We are here because strchr was called with a null byte. */ + .align 4 +L(null_match): + /* r0 has a doubleword of null bytes. */ + + cmpb r5,r12,r0 /* Compare each byte against null bytes. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r5,r5,r6 + sld r5,r5,r6 +#else + sld r5,r5,r6 + srd r5,r5,r6 +#endif + cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes + have been found. */ + bne cr7,L(done_null) + + mtcrf 0x01,r8 + + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bt 28,L(loop_null) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r5,r12,r0 + cmpdi cr7,r5,0 + bne cr7,L(done_null) + b L(loop_null) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + /* Main loop to look for the end of the string. Since it's a + small loop (< 8 instructions), align it to 32-bytes. */ + .p2align 5 +L(loop_null): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r11,16(r8) + cmpb r5,r12,r0 + cmpb r10,r11,r0 + or r6,r5,r10 + cmpdi cr7,r6,0 + beq cr7,L(vector1) + + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + cmpdi cr6,r5,0 + addi r8,r8,-8 + bne cr6,L(done_null) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + pointer. */ + + mr r5,r10 + addi r8,r8,8 + + /* r5 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the pointer. */ +L(done_null): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 +#else + cntlzd r0,r5 /* Count leading zeros before the match. */ +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching null byte. */ + blr + .p2align 5 +L(vector1): + addi r3, r8, 8 + andi. r10, r3, 31 + bne cr0, L(loop_null) + vspltisb v8, -1 + vspltisb v0, 0 + vspltisb v10, 3 + lvsl v11, r0, r0 + vslb v10, v11, v10 + li r5, 16 +L(continue1): + lvx v4, 0, r3 + lvx v5, r3, r5 + vcmpequb v2, v0, v4 + vcmpequb v3, v0, v5 + vor v8, v2, v3 + vcmpequb. v11, v0, v8 + addi r3, r3, 32 + blt cr6, L(continue1) + addi r3, r3, -32 +L(end1): + VBPERMQ(v2, v2, v10) + VBPERMQ(v3, v3, v10) + /* Shift each component into its correct position for merging. */ +#ifdef __LITTLE_ENDIAN__ + vsldoi v3, v3, v3, 2 +#else + vsldoi v2, v2, v2, 6 + vsldoi v3, v3, v3, 4 +#endif + + /* Merge the results and move to a GPR. */ + vor v4, v3, v2 + MFVRD(r5, v4) +#ifdef __LITTLE_ENDIAN__ + addi r6, r5, -1 + andc r6, r6, r5 + popcntd r6, r6 +#else + cntlzd r6, r5 /* Count leading zeros before the match. */ +#endif + add r3, r3, r6 /* Compute final length. */ + blr +END (FUNC_NAME) + +#ifndef USE_AS_STRCHRNUL +weak_alias (strchr, index) +libc_hidden_builtin_def (strchr) +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S new file mode 100644 index 0000000000..3bf4b275dd --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strchrnul.S @@ -0,0 +1,23 @@ +/* Optimized strchrnul implementation for PowerPC64/POWER8. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define USE_AS_STRCHRNUL 1 +#include <sysdeps/powerpc/powerpc64/power8/strchr.S> + +weak_alias (__strchrnul,strchrnul) +libc_hidden_builtin_def (__strchrnul) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S new file mode 100644 index 0000000000..770484f1e1 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcmp.S @@ -0,0 +1,247 @@ +/* Optimized strcmp implementation for PowerPC64/POWER8. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifndef STRCMP +# define STRCMP strcmp +#endif + +/* Implements the function + + size_t [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) + + The implementation uses unaligned doubleword access to avoid specialized + code paths depending of data alignment. Although recent powerpc64 uses + 64K as default, the page cross handling assumes minimum page size of + 4k. */ + +EALIGN (STRCMP, 4, 0) + li r0,0 + + /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using + the code: + + (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) + + with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */ + + rldicl r7,r3,0,52 + rldicl r9,r4,0,52 + cmpldi cr7,r7,4096-16 + bgt cr7,L(pagecross_check) + cmpldi cr5,r9,4096-16 + bgt cr5,L(pagecross_check) + + /* For short string up to 16 bytes, load both s1 and s2 using + unaligned dwords and compare. */ + ld r8,0(r3) + ld r10,0(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + ld r8,8(r3) + ld r10,8(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + addi r7,r3,16 + addi r4,r4,16 + +L(align_8b): + /* Now it has checked for first 16 bytes, align source1 to doubleword + and adjust source2 address. */ + rldicl r9,r7,0,61 /* source1 alignment to doubleword */ + subf r4,r9,r4 /* Adjust source2 address based on source1 + alignment. */ + rldicr r7,r7,0,60 /* Align source1 to doubleword. */ + + /* At this point, source1 alignment is 0 and source2 alignment is + between 0 and 7. Check is source2 alignment is 0, meaning both + sources have the same alignment. */ + andi. r9,r4,0x7 + bne cr0,L(loop_diff_align) + + /* If both source1 and source2 are doubleword aligned, there is no + need for page boundary cross checks. */ + + ld r8,0(r7) + ld r10,0(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + .align 4 +L(loop_equal_align): + ld r8,8(r7) + ld r10,8(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + ld r8,16(r7) + ld r10,16(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + ldu r8,24(r7) + ldu r10,24(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + + b L(loop_equal_align) + + /* A zero byte was found in r8 (s1 dword), r9 contains the cmpb + result and r10 the dword from s2. To code isolate the byte + up to end (including the '\0'), masking with 0xFF the remaining + ones: + + #if __LITTLE_ENDIAN__ + (__builtin_ffsl (x) - 1) = counting trailing zero bits + r9 = (__builtin_ffsl (r9) - 1) + 8; + r9 = -1UL << r9 + #else + r9 = __builtin_clzl (r9) + 8; + r9 = -1UL >> r9 + #endif + r8 = r8 | r9 + r10 = r10 | r9 */ + +#ifdef __LITTLE_ENDIAN__ + nor r9,r9,r9 +L(different_nocmpb): + neg r3,r9 + and r9,r9,r3 + cntlzd r9,r9 + subfic r9,r9,63 +#else + not r9,r9 +L(different_nocmpb): + cntlzd r9,r9 + subfic r9,r9,56 +#endif + srd r3,r8,r9 + srd r10,r10,r9 + rldicl r10,r10,0,56 + rldicl r3,r3,0,56 + subf r3,r10,r3 + extsw r3,r3 + blr + + .align 4 +L(pagecross_check): + subfic r9,r9,4096 + subfic r7,r7,4096 + cmpld cr7,r7,r9 + bge cr7,L(pagecross) + mr r7,r9 + + /* If unaligned 16 bytes reads across a 4K page boundary, it uses + a simple byte a byte comparison until the page alignment for s1 + is reached. */ +L(pagecross): + add r7,r3,r7 + subf r9,r3,r7 + mtctr r9 + + .align 4 +L(pagecross_loop): + /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 + and if *s1 is '\0'. */ + lbz r9,0(r3) + lbz r10,0(r4) + addi r3,r3,1 + addi r4,r4,1 + cmplw cr7,r9,r10 + cmpdi cr5,r9,r0 + bne cr7,L(pagecross_ne) + beq cr5,L(pagecross_nullfound) + bdnz L(pagecross_loop) + b L(align_8b) + + .align 4 + /* The unaligned read of source2 will cross a 4K page boundary, + and the different byte or NULL maybe be in the remaining page + bytes. Since it can not use the unaligned load, the algorithm + reads and compares 8 bytes to keep source1 doubleword aligned. */ +L(check_source2_byte): + li r9,8 + mtctr r9 + + .align 4 +L(check_source2_byte_loop): + lbz r9,0(r7) + lbz r10,0(r4) + addi r7,r7,1 + addi r4,r4,1 + cmplw cr7,r9,10 + cmpdi r5,r9,0 + bne cr7,L(pagecross_ne) + beq cr5,L(pagecross_nullfound) + bdnz L(check_source2_byte_loop) + + /* If source2 is unaligned to doubleword, the code needs to check + on each interation if the unaligned doubleword access will cross + a 4k page boundary. */ + .align 5 +L(loop_unaligned): + ld r8,0(r7) + ld r10,0(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + addi r7,r7,8 + addi r4,r4,8 + +L(loop_diff_align): + /* Check if [src2]+8 cross a 4k page boundary: + + srcin2 % PAGE_SIZE > (PAGE_SIZE - 8) + + with PAGE_SIZE being 4096. */ + rldicl r9,r4,0,52 + cmpldi cr7,r9,4088 + ble cr7,L(loop_unaligned) + b L(check_source2_byte) + + .align 4 +L(pagecross_ne): + extsw r3,r9 + mr r9,r10 +L(pagecross_retdiff): + subf r9,r9,r3 + extsw r3,r9 + blr + + .align 4 +L(pagecross_nullfound): + li r3,0 + b L(pagecross_retdiff) +END (STRCMP) +libc_hidden_builtin_def (strcmp) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S new file mode 100644 index 0000000000..7f2cee4b1b --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcpy.S @@ -0,0 +1,270 @@ +/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifdef USE_AS_STPCPY +# ifndef STPCPY +# define FUNC_NAME __stpcpy +# else +# define FUNC_NAME STPCPY +# endif +#else +# ifndef STRCPY +# define FUNC_NAME strcpy +# else +# define FUNC_NAME STRCPY +# endif +#endif /* !USE_AS_STPCPY */ + +/* Implements the function + + char * [r3] strcpy (char *dest [r3], const char *src [r4]) + + or + + char * [r3] stpcpy (char *dest [r3], const char *src [r4]) + + if USE_AS_STPCPY is defined. + + The implementation uses unaligned doubleword access to avoid specialized + code paths depending of data alignment. Although recent powerpc64 uses + 64K as default, the page cross handling assumes minimum page size of + 4k. */ + + .machine power7 +EALIGN (FUNC_NAME, 4, 0) + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + + /* Check if the [src]+15 will cross a 4K page by checking if the bit + indicating the page size changes. Basically: + + uint64_t srcin = (uint64_t)src; + uint64_t ob = srcin & 4096UL; + uint64_t nb = (srcin+15UL) & 4096UL; + if (ob ^ nb) + goto pagecross; */ + + addi r9,r4,15 + xor r9,r9,r4 + rlwinm. r9,r9,0,19,19 + bne L(pagecross) + + /* For short string (less than 16 bytes), just calculate its size as + strlen and issues a memcpy if null is found. */ + mr r7,r4 + ld r12,0(r7) /* Load doubleword from memory. */ + cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ + bne cr7,L(done) + + ldu r8,8(r7) + cmpb r10,r8,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) + + b L(loop_before) + + .align 4 +L(pagecross): + clrrdi r7,r4,3 /* Align the address to doubleword boundary. */ + rlwinm r6,r4,3,26,28 /* Calculate padding. */ + li r5,-1 /* MASK = 0xffffffffffffffff. */ + ld r12,0(r7) /* Load doubleword from memory. */ +#ifdef __LITTLE_ENDIAN__ + sld r5,r5,r6 +#else + srd r5,r5,r6 /* MASK = MASK >> padding. */ +#endif + orc r9,r12,r5 /* Mask bits that are not part of the string. */ + cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ + bne cr7,L(done) + + ldu r6,8(r7) + cmpb r10,r6,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) + + ld r12,0(r7) + cmpb r10,r12,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) + + ldu r6,8(r7) + cmpb r10,r6,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) + + /* We checked for 24 - x bytes, with x being the source alignment + (0 <= x <= 16), and no zero has been found. Start the loop + copy with doubleword aligned address. */ + mr r7,r4 + ld r12, 0(r7) + ldu r8, 8(r7) + +L(loop_before): + /* Save the two doublewords readed from source and align the source + to 16 bytes for the loop. */ + mr r11,r3 + std r12,0(r11) + std r8,8(r11) + addi r11,r11,16 + rldicl r9,r4,0,60 + subf r7,r9,r7 + subf r11,r9,r11 + b L(loop_start) + + .align 5 +L(loop): + std r12, 0(r11) + std r6, 8(r11) + addi r11,r11,16 +L(loop_start): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + + ld r12, 8(r7) + ldu r6, 16(r7) + cmpb r10,r12,r0 + cmpb r9,r6,r0 + or r8,r9,r10 /* Merge everything in one doubleword. */ + cmpdi cr7,r8,0 + beq cr7,L(loop) + + + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + addi r4,r7,-8 + cmpdi cr6,r10,0 + addi r7,r7,-8 + bne cr6,L(done2) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + length. */ + + mr r10,r9 + addi r7,r7,8 + b L(done2) + + /* r10 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the length. */ +L(done): + mr r11,r3 +L(done2): +#ifdef __LITTLE_ENDIAN__ + addi r9, r10, -1 /* Form a mask from trailing zeros. */ + andc r9, r9, r10 + popcntd r6, r9 /* Count the bits in the mask. */ +#else + cntlzd r6,r10 /* Count leading zeros before the match. */ +#endif + subf r5,r4,r7 + srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */ + add r8,r5,r6 /* Compute final length. */ +#ifdef USE_AS_STPCPY + /* stpcpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r8 +#endif + addi r8,r8,1 /* Final '/0'. */ + + cmpldi cr6,r8,8 + mtocrf 0x01,r8 + ble cr6,L(copy_LE_8) + + cmpldi cr1,r8,16 + blt cr1,8f + + /* Handle copies of 0~31 bytes. */ + .align 4 +L(copy_LT_32): + /* At least 6 bytes to go. */ + blt cr1,8f + + /* Copy 16 bytes. */ + ld r6,0(r4) + ld r8,8(r4) + addi r4,r4,16 + std r6,0(r11) + std r8,8(r11) + addi r11,r11,16 +8: /* Copy 8 bytes. */ + bf 28,L(tail4) + ld r6,0(r4) + addi r4,r4,8 + std r6,0(r11) + addi r11,r11,8 + + .align 4 +/* Copies 4~7 bytes. */ +L(tail4): + bf 29,L(tail2) + lwz r6,0(r4) + stw r6,0(r11) + bf 30,L(tail5) + lhz r7,4(r4) + sth r7,4(r11) + bflr 31 + lbz r8,6(r4) + stb r8,6(r11) + blr + + .align 4 +/* Copies 2~3 bytes. */ +L(tail2): + bf 30,1f + lhz r6,0(r4) + sth r6,0(r11) + bflr 31 + lbz r7,2(r4) + stb r7,2(r11) + blr + + .align 4 +L(tail5): + bf 31,1f + lbz r6,4(r4) + stb r6,4(r11) + blr + + .align 4 +1: + bflr 31 + lbz r6,0(r4) + stb r6,0(r11) + blr + +/* Handles copies of 0~8 bytes. */ + .align 4 +L(copy_LE_8): + bne cr6,L(tail4) + ld r6,0(r4) + std r6,0(r11) + blr +END (FUNC_NAME) + +#ifndef USE_AS_STPCPY +libc_hidden_builtin_def (strcpy) +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S new file mode 100644 index 0000000000..c9a7a2e3c3 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strcspn.S @@ -0,0 +1,20 @@ +/* Optimized strcspn implementation for PowerPC64/POWER8. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define USE_AS_STRCSPN 1 +#include <sysdeps/powerpc/powerpc64/power8/strspn.S> diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S new file mode 100644 index 0000000000..8f4a1fc1dc --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strlen.S @@ -0,0 +1,301 @@ +/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized + loop. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* TODO: change these to the actual instructions when the minimum required + binutils allows it. */ +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) + +/* int [r3] strlen (char *s [r3]) */ + +#ifndef STRLEN +# define STRLEN strlen +#endif + +/* TODO: change this to .machine power8 when the minimum required binutils + allows it. */ + .machine power7 +EALIGN (STRLEN, 4, 0) + CALL_MCOUNT 1 + dcbt 0,r3 + clrrdi r4,r3,3 /* Align the address to doubleword boundary. */ + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + li r5,-1 /* MASK = 0xffffffffffffffff. */ + ld r12,0(r4) /* Load doubleword from memory. */ +#ifdef __LITTLE_ENDIAN__ + sld r5,r5,r6 +#else + srd r5,r5,r6 /* MASK = MASK >> padding. */ +#endif + orc r9,r12,r5 /* Mask bits that are not part of the string. */ + cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ + bne cr7,L(done) + + /* For shorter strings (< 64 bytes), we will not use vector registers, + as the overhead isn't worth it. So, let's use GPRs instead. This + will be done the same way as we do in the POWER7 implementation. + Let's see if we are aligned to a quadword boundary. If so, we can + jump to the first (non-vectorized) loop. Otherwise, we have to + handle the next DWORD first. */ + mtcrf 0x01,r4 + mr r9,r4 + addi r9,r9,8 + bt 28,L(align64) + + /* Handle the next 8 bytes so we are aligned to a quadword + boundary. */ + ldu r5,8(r4) + cmpb r10,r5,r0 + cmpdi cr7,r10,0 + addi r9,r9,8 + bne cr7,L(done) + +L(align64): + /* Proceed to the old (POWER7) implementation, checking two doublewords + per iteraction. For the first 56 bytes, we will just check for null + characters. After that, we will also check if we are 64-byte aligned + so we can jump to the vectorized implementation. We will unroll + these loops to avoid excessive branching. */ + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + /* Are we 64-byte aligned? If so, jump to the vectorized loop. + Note: aligning to 64-byte will necessarily slow down performance for + strings around 64 bytes in length due to the extra comparisons + required to check alignment for the vectorized loop. This is a + necessary tradeoff we are willing to take in order to speed up the + calculation for larger strings. */ + andi. r10,r9,63 + beq cr0,L(preloop) + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + andi. r10,r9,63 + beq cr0,L(preloop) + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + andi. r10,r9,63 + beq cr0,L(preloop) + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + andi. r10,r9,63 + beq cr0,L(preloop) + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + + /* At this point, we are necessarily 64-byte aligned. If no zeroes were + found, jump to the vectorized loop. */ + beq cr7,L(preloop) + +L(dword_zero): + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + cmpdi cr6,r10,0 + addi r4,r4,-8 + bne cr6,L(done) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + length. */ + + mr r10,r11 + addi r4,r4,8 + + /* If the null byte was found in the non-vectorized code, compute the + final length. r10 has the output of the cmpb instruction, that is, + it contains 0xff in the same position as the null byte in the + original doubleword from the string. Use that to calculate the + length. */ +L(done): +#ifdef __LITTLE_ENDIAN__ + addi r9, r10,-1 /* Form a mask from trailing zeros. */ + andc r9, r9,r10 + popcntd r0, r9 /* Count the bits in the mask. */ +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + subf r5,r3,r4 + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ + add r3,r5,r0 /* Compute final length. */ + blr + + /* Vectorized implementation starts here. */ + .p2align 4 +L(preloop): + /* Set up for the loop. */ + mr r4,r9 + li r7, 16 /* Load required offsets. */ + li r8, 32 + li r9, 48 + li r12, 8 + vxor v0,v0,v0 /* VR with null chars to use with + vcmpequb. */ + + /* Main loop to look for the end of the string. We will read in + 64-byte chunks. Align it to 32 bytes and unroll it 3 times to + leverage the icache performance. */ + .p2align 5 +L(loop): + lvx v1,r4,r0 /* Load 4 quadwords. */ + lvx v2,r4,r7 + lvx v3,r4,r8 + lvx v4,r4,r9 + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ + vminub v6,v3,v4 + vminub v7,v5,v6 + vcmpequb. v7,v7,v0 /* Check for NULLs. */ + addi r4,r4,64 /* Adjust address for the next iteration. */ + bne cr6,L(vmx_zero) + + lvx v1,r4,r0 /* Load 4 quadwords. */ + lvx v2,r4,r7 + lvx v3,r4,r8 + lvx v4,r4,r9 + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ + vminub v6,v3,v4 + vminub v7,v5,v6 + vcmpequb. v7,v7,v0 /* Check for NULLs. */ + addi r4,r4,64 /* Adjust address for the next iteration. */ + bne cr6,L(vmx_zero) + + lvx v1,r4,r0 /* Load 4 quadwords. */ + lvx v2,r4,r7 + lvx v3,r4,r8 + lvx v4,r4,r9 + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ + vminub v6,v3,v4 + vminub v7,v5,v6 + vcmpequb. v7,v7,v0 /* Check for NULLs. */ + addi r4,r4,64 /* Adjust address for the next iteration. */ + beq cr6,L(loop) + +L(vmx_zero): + /* OK, we found a null byte. Let's look for it in the current 64-byte + block and mark it in its corresponding VR. */ + vcmpequb v1,v1,v0 + vcmpequb v2,v2,v0 + vcmpequb v3,v3,v0 + vcmpequb v4,v4,v0 + + /* We will now 'compress' the result into a single doubleword, so it + can be moved to a GPR for the final calculation. First, we + generate an appropriate mask for vbpermq, so we can permute bits into + the first halfword. */ + vspltisb v10,3 + lvsl v11,r0,r0 + vslb v10,v11,v10 + + /* Permute the first bit of each byte into bits 48-63. */ + VBPERMQ(v1,v1,v10) + VBPERMQ(v2,v2,v10) + VBPERMQ(v3,v3,v10) + VBPERMQ(v4,v4,v10) + + /* Shift each component into its correct position for merging. */ +#ifdef __LITTLE_ENDIAN__ + vsldoi v2,v2,v2,2 + vsldoi v3,v3,v3,4 + vsldoi v4,v4,v4,6 +#else + vsldoi v1,v1,v1,6 + vsldoi v2,v2,v2,4 + vsldoi v3,v3,v3,2 +#endif + + /* Merge the results and move to a GPR. */ + vor v1,v2,v1 + vor v2,v3,v4 + vor v4,v1,v2 + MFVRD(r10,v4) + + /* Adjust address to the begninning of the current 64-byte block. */ + addi r4,r4,-64 + +#ifdef __LITTLE_ENDIAN__ + addi r9, r10,-1 /* Form a mask from trailing zeros. */ + andc r9, r9,r10 + popcntd r0, r9 /* Count the bits in the mask. */ +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + subf r5,r3,r4 + add r3,r5,r0 /* Compute final length. */ + blr + +END (STRLEN) +libc_hidden_builtin_def (strlen) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S new file mode 100644 index 0000000000..32e09e4d94 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncase.S @@ -0,0 +1,20 @@ +/* Optimized strncasecmp implementation for POWER8. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define USE_AS_STRNCASECMP 1 +#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S> diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S new file mode 100644 index 0000000000..3d8df90538 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncmp.S @@ -0,0 +1,327 @@ +/* Optimized strncmp implementation for PowerPC64/POWER8. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifndef STRNCMP +# define STRNCMP strncmp +#endif + +/* Implements the function + + int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n) + + The implementation uses unaligned doubleword access to avoid specialized + code paths depending of data alignment. Although recent powerpc64 uses + 64K as default, the page cross handling assumes minimum page size of + 4k. */ + + .machine power7 +EALIGN (STRNCMP, 4, 0) + /* Check if size is 0. */ + mr. r10,r5 + beq cr0,L(ret0) + + /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using + the code: + + (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) + + with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */ + rldicl r8,r3,0,52 + cmpldi cr7,r8,4096-16 + bgt cr7,L(pagecross) + rldicl r9,r4,0,52 + cmpldi cr7,r9,4096-16 + bgt cr7,L(pagecross) + + /* For short string up to 16 bytes, load both s1 and s2 using + unaligned dwords and compare. */ + ld r7,0(r3) + ld r9,0(r4) + li r8,0 + cmpb r8,r7,r8 + cmpb r6,r7,r9 + orc. r8,r8,r6 + bne cr0,L(different1) + + /* If the string compared are equal, but size is less or equal + to 8, return 0. */ + cmpldi cr7,r10,8 + li r9,0 + ble cr7,L(ret1) + addi r5,r10,-8 + + ld r7,8(r3) + ld r9,8(r4) + cmpb r8,r7,r8 + cmpb r6,r7,r9 + orc. r8,r8,r6 + bne cr0,L(different0) + + cmpldi cr7,r5,8 + mr r9,r8 + ble cr7,L(ret1) + + /* Update pointers and size. */ + addi r10,r10,-16 + addi r3,r3,16 + addi r4,r4,16 + + /* Now it has checked for first 16 bytes, align source1 to doubleword + and adjust source2 address. */ +L(align_8b): + rldicl r5,r3,0,61 + rldicr r3,r3,0,60 + subf r4,r5,r4 + add r10,r10,r5 + + /* At this point, source1 alignment is 0 and source2 alignment is + between 0 and 7. Check is source2 alignment is 0, meaning both + sources have the same alignment. */ + andi. r8,r4,0x7 + beq cr0,L(loop_eq_align_0) + + li r5,0 + b L(loop_ne_align_1) + + /* If source2 is unaligned to doubleword, the code needs to check + on each interation if the unaligned doubleword access will cross + a 4k page boundary. */ + .align 4 +L(loop_ne_align_0): + ld r7,0(r3) + ld r9,0(r4) + cmpb r8,r7,r5 + cmpb r6,r7,r9 + orc. r8,r8,r6 + bne cr0,L(different1) + + cmpldi cr7,r10,8 + ble cr7,L(ret0) + addi r10,r10,-8 + addi r3,r3,8 + addi r4,r4,8 +L(loop_ne_align_1): + rldicl r9,r4,0,52 + cmpldi r7,r9,4088 + ble cr7,L(loop_ne_align_0) + cmpdi cr7,r10,0 + beq cr7,L(ret0) + + lbz r9,0(r3) + lbz r8,0(r4) + cmplw cr7,r9,r8 + bne cr7,L(byte_ne_4) + cmpdi cr7,r9,0 + beq cr7,L(size_reached_0) + + li r9,r7 + addi r8,r3,1 + mtctr r9 + addi r4,r4,1 + addi r10,r10,-1 + addi r3,r3,8 + + /* The unaligned read of source2 will cross a 4K page boundary, + and the different byte or NULL maybe be in the remaining page + bytes. Since it can not use the unaligned load the algorithm + reads and compares 8 bytes to keep source1 doubleword aligned. */ + .align 4 +L(loop_ne_align_byte): + cmpdi cr7,r10,0 + addi r10,r10,-1 + beq cr7,L(ret0) + lbz r9,0(r8) + lbz r7,0(r4) + addi r8,r8,1 + addi r4,r4,1 + cmplw cr7,r9,r7 + cmpdi cr5,r9,0 + bne cr7,L(size_reached_2) + beq cr5,L(size_reached_0) + bdnz L(loop_ne_align_byte) + + cmpdi cr7,r10,0 + bne+ cr7,L(loop_ne_align_0) + + .align 4 +L(ret0): + li r9,0 +L(ret1): + mr r3,r9 + blr + + /* The code now check if r8 and r10 are different by issuing a + cmpb and shift the result based on its output: + + #ifdef __LITTLE_ENDIAN__ + leadzero = (__builtin_ffsl (z1) - 1); + leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero; + r1 = (r1 >> leadzero) & 0xFFUL; + r2 = (r2 >> leadzero) & 0xFFUL; + #else + leadzero = __builtin_clzl (z1); + leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero; + r1 = (r1 >> (56 - leadzero)) & 0xFFUL; + r2 = (r2 >> (56 - leadzero)) & 0xFFUL; + #endif + return r1 - r2; */ + + .align 4 +L(different0): + mr r10,r5 +#ifdef __LITTLE_ENDIAN__ +L(different1): + neg r11,r8 + sldi r10,r10,3 + and r8,r11,r8 + addi r10,r10,-8 + cntlzd r8,r8 + subfic r8,r8,63 + extsw r8,r8 + cmpld cr7,r8,r10 + ble cr7,L(different2) + mr r8,r10 +L(different2): + extsw r8,r8 +#else +L(different1): + addi r10,r10,-1 + cntlzd r8,r8 + sldi r10,r10,3 + cmpld cr7,r8,r10 + blt cr7,L(different2) + mr r8,r10 +L(different2): + subfic r8,r8,56 +#endif + srd r7,r7,r8 + srd r9,r9,r8 + rldicl r3,r7,0,56 + rldicl r9,r9,0,56 + subf r9,r9,3 + extsw r9,r9 + mr r3,r9 + blr + + /* If unaligned 16 bytes reads across a 4K page boundary, it uses + a simple byte a byte comparison until the page alignment for s1 + is reached. */ + .align 4 +L(pagecross): + lbz r7,0(r3) + lbz r9,0(r4) + subfic r8,r8,4095 + cmplw cr7,r9,r7 + bne cr7,L(byte_ne_3) + cmpdi cr7,r9,0 + beq cr7,L(byte_ne_0) + addi r10,r10,-1 + subf r7,r8,r10 + subf r9,r7,r10 + addi r9,r9,1 + mtctr r9 + b L(pagecross_loop1) + + .align 4 +L(pagecross_loop0): + beq cr7,L(ret0) + lbz r9,0(r3) + lbz r8,0(r4) + addi r10,r10,-1 + cmplw cr7,r9,r8 + cmpdi cr5,r9,0 + bne r7,L(byte_ne_2) + beq r5,L(byte_ne_0) +L(pagecross_loop1): + cmpdi cr7,r10,0 + addi r3,r3,1 + addi r4,r4,1 + bdnz L(pagecross_loop0) + cmpdi cr7,r7,0 + li r9,0 + bne+ cr7,L(align_8b) + b L(ret1) + + /* If both source1 and source2 are doubleword aligned, there is no + need for page boundary cross checks. */ + .align 4 +L(loop_eq_align_0): + ld r7,0(r3) + ld r9,0(r4) + cmpb r8,r7,r8 + cmpb r6,r7,r9 + orc. r8,r8,r6 + bne cr0,L(different1) + + cmpldi cr7,r10,8 + ble cr7,L(ret0) + addi r9,r10,-9 + + li r5,0 + srdi r9,r9,3 + addi r9,r9,1 + mtctr r9 + b L(loop_eq_align_2) + + .align 4 +L(loop_eq_align_1): + bdz L(ret0) +L(loop_eq_align_2): + ldu r7,8(r3) + addi r10,r10,-8 + ldu r9,8(r4) + cmpb r8,r7,r5 + cmpb r6,r7,r9 + orc. r8,r8,r6 + beq cr0,L(loop_eq_align_1) + b L(different1) + + .align 4 +L(byte_ne_0): + li r7,0 +L(byte_ne_1): + subf r9,r9,r7 + extsw r9,r9 + b L(ret1) + + .align 4 +L(byte_ne_2): + extsw r7,r9 + mr r9,r8 + b L(byte_ne_1) +L(size_reached_0): + li r10,0 +L(size_reached_1): + subf r9,r9,r10 + extsw r9,r9 + b L(ret1) +L(size_reached_2): + extsw r10,r9 + mr r9,r7 + b L(size_reached_1) +L(byte_ne_3): + extsw r7,r7 + b L(byte_ne_1) +L(byte_ne_4): + extsw r10,r9 + mr r9,r8 + b L(size_reached_1) +END(STRNCMP) +libc_hidden_builtin_def(strncmp) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S new file mode 100644 index 0000000000..6d40f30ff7 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strncpy.S @@ -0,0 +1,465 @@ +/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifdef USE_AS_STPNCPY +# ifndef STPNCPY +# define FUNC_NAME __stpncpy +# else +# define FUNC_NAME STPNCPY +# endif +#else +# ifndef STRNCPY +# define FUNC_NAME strncpy +# else +# define FUNC_NAME STRNCPY +# endif +#endif /* !USE_AS_STPNCPY */ + +#ifndef MEMSET +/* For builds without IFUNC support, local calls should be made to internal + GLIBC symbol (created by libc_hidden_builtin_def). */ +# ifdef SHARED +# define MEMSET __GI_memset +# else +# define MEMSET memset +# endif +#endif + +#define FRAMESIZE (FRAME_MIN_SIZE+48) + +/* Implements the function + + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + or + + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + if USE_AS_STPCPY is defined. + + The implementation uses unaligned doubleword access to avoid specialized + code paths depending of data alignment. Although recent powerpc64 uses + 64K as default, the page cross handling assumes minimum page size of + 4k. */ + + .machine power7 +EALIGN (FUNC_NAME, 4, 0) + + /* Check if the [src]+15 will cross a 4K page by checking if the bit + indicating the page size changes. Basically: + + uint64_t srcin = (uint64_t)src; + uint64_t ob = srcin & 4096UL; + uint64_t nb = (srcin+15UL) & 4096UL; + if (ob ^ nb) + goto pagecross; */ + + addi r10,r4,16 + rlwinm r9,r4,0,19,19 + + /* Save some non-volatile registers on the stack. */ + std r26,-48(r1) + std r27,-40(r1) + + rlwinm r8,r10,0,19,19 + + std r28,-32(r1) + std r29,-24(r1) + + cmpld cr7,r9,r8 + + std r30,-16(r1) + std r31,-8(r1) + + /* Update CFI. */ + cfi_offset(r26, -48) + cfi_offset(r27, -40) + cfi_offset(r28, -32) + cfi_offset(r29, -24) + cfi_offset(r30, -16) + cfi_offset(r31, -8) + + beq cr7,L(unaligned_lt_16) + rldicl r9,r4,0,61 + subfic r8,r9,8 + cmpld cr7,r5,r8 + bgt cr7,L(pagecross) + + /* At this points there is 1 to 15 bytes to check and write. Since it could + be either from first unaligned 16 bytes access or from bulk copy, the code + uses an unrolled byte read/write instead of trying to analyze the cmpb + results. */ +L(short_path): + mr r9,r3 +L(short_path_1): + /* Return if there are no more bytes to be written. */ + cmpdi cr7,r5,0 + beq cr7,L(short_path_loop_end_1) +L(short_path_2): + /* Copy one char from src (r4) and write it to dest (r9). If it is the + end-of-string, start the null padding. Continue, otherwise. */ + lbz r10,0(r4) + cmpdi cr7,r10,0 + stb r10,0(r9) + beq cr7,L(zero_pad_start_1) + /* If there are no more bytes to be written, return. */ + cmpdi cr0,r5,1 + addi r8,r9,1 + addi r6,r5,-1 + beq cr0,L(short_path_loop_end_0) + /* Copy another char from src (r4) to dest (r9). Check again if it is + the end-of-string. If so, start the null padding. */ + lbz r10,1(r4) + cmpdi cr7,r10,0 + stb r10,1(r9) + beq cr7,L(zero_pad_start_prepare_1) + /* Eagerly decrement r5 by 3, which is the number of bytes already + written, plus one write that will be performed later on. */ + addi r10,r5,-3 + b L(short_path_loop_1) + + .align 4 +L(short_path_loop): + /* At this point, the induction variable, r5, as well as the pointers + to dest and src (r9 and r4, respectivelly) have been updated. + + Note: The registers r7 and r10 are induction variables derived from + r5. They are used to determine if the total number of writes has + been reached at every other write. + + Copy one char from src (r4) and write it to dest (r9). If it is the + end-of-string, start the null padding. Continue, otherwise. */ + lbz r8,0(r4) + addi r7,r10,-2 + cmpdi cr5,r8,0 + stb r8,0(r9) + beq cr5,L(zero_pad_start_1) + beq cr7,L(short_path_loop_end_0) + /* Copy another char from src (r4) to dest (r9). Check again if it is + the end-of-string. If so, start the null padding. */ + lbz r8,1(r4) + cmpdi cr7,r8,0 + stb r8,1(r9) + beq cr7,L(zero_pad_start) + mr r10,r7 +L(short_path_loop_1): + /* This block is reached after two chars have been already written to + dest. Nevertheless, r5 (the induction variable), r9 (the pointer to + dest), and r4 (the pointer to src) have not yet been updated. + + At this point: + r5 holds the count of bytes yet to be written plus 2. + r9 points to the last two chars that were already written to dest. + r4 points to the last two chars that were already copied from src. + + The algorithm continues by decrementing r5, the induction variable, + so that it reflects the last two writes. The pointers to dest (r9) + and to src (r4) are increment by two, for the same reason. + + Note: Register r10 is another induction variable, derived from r5, + which determines if the total number of writes has been reached. */ + addic. r5,r5,-2 + addi r9,r9,2 + cmpdi cr7,r10,0 /* Eagerly check if the next write is the last. */ + addi r4,r4,2 + addi r6,r9,1 + bne cr0,L(short_path_loop) /* Check if the total number of writes + has been reached at every other + write. */ +#ifdef USE_AS_STPNCPY + mr r3,r9 + b L(short_path_loop_end) +#endif + +L(short_path_loop_end_0): +#ifdef USE_AS_STPNCPY + addi r3,r9,1 + b L(short_path_loop_end) +#endif +L(short_path_loop_end_1): +#ifdef USE_AS_STPNCPY + mr r3,r9 +#endif +L(short_path_loop_end): + /* Restore non-volatile registers. */ + ld r26,-48(r1) + ld r27,-40(r1) + ld r28,-32(r1) + ld r29,-24(r1) + ld r30,-16(r1) + ld r31,-8(r1) + blr + + /* This code pads the remainder of dest with NULL bytes. The algorithm + calculates the remaining size and calls memset. */ + .align 4 +L(zero_pad_start): + mr r5,r10 + mr r9,r6 +L(zero_pad_start_1): + /* At this point: + - r5 holds the number of bytes that still have to be written to + dest. + - r9 points to the position, in dest, where the first null byte + will be written. + The above statements are true both when control reaches this label + from a branch or when falling through the previous lines. */ +#ifndef USE_AS_STPNCPY + mr r30,r3 /* Save the return value of strncpy. */ +#endif + /* Prepare the call to memset. */ + mr r3,r9 /* Pointer to the area to be zero-filled. */ + li r4,0 /* Byte to be written (zero). */ + + /* We delayed the creation of the stack frame, as well as the saving of + the link register, because only at this point, we are sure that + doing so is actually needed. */ + + /* Save the link register. */ + mflr r0 + std r0,16(r1) + cfi_offset(lr, 16) + + /* Create the stack frame. */ + stdu r1,-FRAMESIZE(r1) + cfi_adjust_cfa_offset(FRAMESIZE) + + bl MEMSET + nop + + /* Restore the stack frame. */ + addi r1,r1,FRAMESIZE + cfi_adjust_cfa_offset(-FRAMESIZE) + /* Restore the link register. */ + ld r0,16(r1) + mtlr r0 + +#ifndef USE_AS_STPNCPY + mr r3,r30 /* Restore the return value of strncpy, i.e.: + dest. For stpncpy, the return value is the + same as return value of memset. */ +#endif + + /* Restore non-volatile registers and return. */ + ld r26,-48(r1) + ld r27,-40(r1) + ld r28,-32(r1) + ld r29,-24(r1) + ld r30,-16(r1) + ld r31,-8(r1) + blr + + /* The common case where [src]+16 will not cross a 4K page boundary. + In this case the code fast check the first 16 bytes by using doubleword + read/compares and update destiny if neither total size or null byte + is found in destiny. */ + .align 4 +L(unaligned_lt_16): + cmpldi cr7,r5,7 + ble cr7,L(short_path) + ld r7,0(r4) + li r8,0 + cmpb r8,r7,r8 + cmpdi cr7,r8,0 + bne cr7,L(short_path_prepare_2) + addi r6,r5,-8 + std r7,0(r3) + addi r9,r3,8 + cmpldi cr7,r6,7 + addi r7,r4,8 + ble cr7,L(short_path_prepare_1_1) + ld r4,8(r4) + cmpb r8,r4,r8 + cmpdi cr7,r8,0 + bne cr7,L(short_path_prepare_2_1) + std r4,8(r3) + addi r29,r3,16 + addi r5,r5,-16 + /* Neither the null byte was found or total length was reached, + align to 16 bytes and issue a bulk copy/compare. */ + b L(align_to_16b) + + /* In the case of 4k page boundary cross, the algorithm first align + the address to a doubleword, calculate a mask based on alignment + to ignore the bytes and continue using doubleword. */ + .align 4 +L(pagecross): + rldicr r11,r4,0,59 /* Align the address to 8 bytes boundary. */ + li r6,-1 /* MASK = 0xffffffffffffffffUL. */ + sldi r9,r9,3 /* Calculate padding. */ + ld r7,0(r11) /* Load doubleword from memory. */ +#ifdef __LITTLE_ENDIAN__ + sld r9,r6,r9 /* MASK = MASK << padding. */ +#else + srd r9,r6,r9 /* MASK = MASK >> padding. */ +#endif + orc r9,r7,r9 /* Mask bits that are not part of the + string. */ + li r7,0 + cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + subf r8,r8,r5 /* Adjust total length. */ + cmpldi cr7,r8,8 /* Check if length was reached. */ + ble cr7,L(short_path_prepare_2) + + /* For next checks we have aligned address, so we check for more + three doublewords to make sure we can read 16 unaligned bytes + to start the bulk copy with 16 aligned addresses. */ + ld r7,8(r11) + cmpb r9,r7,r9 + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + addi r7,r8,-8 + cmpldi cr7,r7,8 + ble cr7,L(short_path_prepare_2) + ld r7,16(r11) + cmpb r9,r7,r9 + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + addi r8,r8,-16 + cmpldi cr7,r8,8 + ble cr7,L(short_path_prepare_2) + ld r8,24(r11) + cmpb r9,r8,r9 + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + + /* No null byte found in the 32 bytes readed and length not reached, + read source again using unaligned loads and store them. */ + ld r9,0(r4) + addi r29,r3,16 + addi r5,r5,-16 + std r9,0(r3) + ld r9,8(r4) + std r9,8(r3) + + /* Align source to 16 bytes and adjust destiny and size. */ +L(align_to_16b): + rldicl r9,r10,0,60 + rldicr r28,r10,0,59 + add r12,r5,r9 + subf r29,r9,r29 + + /* The bulk read/compare/copy loads two doublewords, compare and merge + in a single register for speed. This is an attempt to speed up the + null-checking process for bigger strings. */ + + cmpldi cr7,r12,15 + ble cr7,L(short_path_prepare_1_2) + + /* Main loop for large sizes, unrolled 2 times to get better use of + pipeline. */ + ld r8,0(28) + ld r10,8(28) + li r9,0 + cmpb r7,r8,r9 + cmpb r9,r10,r9 + or. r6,r9,r7 + bne cr0,L(short_path_prepare_2_3) + addi r5,r12,-16 + addi r4,r28,16 + std r8,0(r29) + std r10,8(r29) + cmpldi cr7,r5,15 + addi r9,r29,16 + ble cr7,L(short_path_1) + mr r11,r28 + mr r6,r29 + li r30,0 + subfic r26,r4,48 + subfic r27,r9,48 + + b L(loop_16b) + + .align 4 +L(loop_start): + ld r31,0(r11) + ld r10,8(r11) + cmpb r0,r31,r7 + cmpb r8,r10,r7 + or. r7,r0,r8 + addi r5,r5,-32 + cmpldi cr7,r5,15 + add r4,r4,r26 + add r9,r9,r27 + bne cr0,L(short_path_prepare_2_2) + add r4,r28,r4 + std r31,0(r6) + add r9,r29,r9 + std r10,8(r6) + ble cr7,L(short_path_1) + +L(loop_16b): + ld r10,16(r11) + ld r0,24(r11) + cmpb r8,r10,r30 + cmpb r7,r0,r30 + or. r7,r8,r7 + addi r12,r12,-32 + cmpldi cr7,r12,15 + addi r11,r11,32 + bne cr0,L(short_path_2) + std r10,16(r6) + addi r6,r6,32 + std r0,-8(r6) + bgt cr7,L(loop_start) + + mr r5,r12 + mr r4,r11 + mr r9,r6 + b L(short_path_1) + + .align 4 +L(short_path_prepare_1_1): + mr r5,r6 + mr r4,r7 + b L(short_path_1) +L(short_path_prepare_1_2): + mr r5,r12 + mr r4,r28 + mr r9,r29 + b L(short_path_1) +L(short_path_prepare_2): + mr r9,r3 + b L(short_path_2) +L(short_path_prepare_2_1): + mr r5,r6 + mr r4,r7 + b L(short_path_2) +L(short_path_prepare_2_2): + mr r5,r12 + mr r4,r11 + mr r9,r6 + b L(short_path_2) +L(short_path_prepare_2_3): + mr r5,r12 + mr r4,r28 + mr r9,r29 + b L(short_path_2) +L(zero_pad_start_prepare_1): + mr r5,r6 + mr r9,r8 + b L(zero_pad_start_1) +END (FUNC_NAME) + +#ifndef USE_AS_STPNCPY +libc_hidden_builtin_def (strncpy) +#endif diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S new file mode 100644 index 0000000000..3eadbfb09e --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strnlen.S @@ -0,0 +1,433 @@ +/* Optimized strnlen implementation for POWER8 using a vmx loop. + + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* It is implemented the following heuristic: + 1. Case maxlen <= 32: align the pointer to 8 bytes to loop through + reading doublewords. Uses the POWER7 algorithm. + 2. Case maxlen > 32: check for null bytes in the first 16 bytes using + unaligned accesses. Return length if found. Otherwise: + 2.1 Case maxlen < 64: deduct the bytes previously read, align + the pointer to 16 bytes and loop through reading quadwords + until find null bytes or reach maxlen. + 2.2 Case maxlen > 64: deduct the bytes previously read, align + the pointer to 64 bytes and set up a counter to loop through + reading in strides of 64 bytes. In case it finished the loop + with null bytes not found, process the remainder bytes by + switching to the loop to heuristic in 2.1. */ + +#include <sysdep.h> + +/* Define default page size to 4KB. */ +#define PAGE_SIZE 4096 + +/* The following macros implement Power ISA v2.07 opcodes + that could not be used directly into this code to the keep + compatibility with older binutils versions. */ + +/* Move from vector register doubleword. */ +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) + +/* Move to vector register doubleword. */ +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) + +/* Vector Bit Permute Quadword. */ +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) + +/* Vector Population Count Halfword. */ +#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21))) + +/* Vector Count Leading Zeros Halfword. */ +#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21))) + + +/* int [r3] strnlen (char *s [r3], size_t maxlen [r4]) */ +/* TODO: change to power8 when minimum required binutils allows it. */ + .machine power7 +ENTRY (__strnlen) + CALL_MCOUNT 2 + dcbt 0,r3 + + cmpldi r4,32 /* Check if maxlen <= 32. */ + ble L(small_range) /* If maxlen <= 32. */ + + /* Upcoming 16 bytes unaligned accesses cannot cross the page boundary + otherwise the processor throws an memory access error. + Use following code to check there is room for such as accesses: + (((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16) + If it is disallowed then switch to the code that handles + the string when maxlen <= 32. */ + clrldi r10,r3,52 + cmpldi cr7,r10,PAGE_SIZE-16 + bgt cr7,L(small_range) /* If less than 16B of page end. */ + + /* Compute our permute constant r8. */ + li r7,0 + /* Compute a bpermd constant to move bit 0 of each word into + a halfword value, and count trailing zeros. */ +#ifdef __LITTLE_ENDIAN__ + li r8,0x2820 + oris r8,r8,0x3830 + sldi r8,r8,32 + ori r8,r8,0x0800 + oris r8,r8,0x1810 +#else + li r8,0x1018 + oris r8,r8,0x0008 + sldi r8,r8,32 + ori r8,r8,0x3038 + oris r8,r8,0x2028 +#endif + + /* maxlen > 32. Optimistically check for null bytes in the first + 16 bytes of the string using unaligned accesses. */ + ld r5,0(r3) + ld r6,8(r3) + cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */ + cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */ + or. r7,r10,r11 + bne cr0, L(early_find) /* If found null bytes. */ + + /* At this point maxlen > 32 and null bytes were not found at first + 16 bytes. Prepare for loop using VMX. */ + + /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */ + + addi r5,r3,16 /* Align up, or just add the 16B we + already checked. */ + li r0,15 + and r7,r5,r0 /* Find offset into 16B alignment. */ + andc r5,r5,r0 /* Quadword align up s to the next quadword. */ + li r0,16 + subf r0,r7,r0 + subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */ + + + /* Compute offsets for vmx loads, and precompute the vbpermq + constants for both the 64B and 16B loops. */ + li r6,0 + vspltisb v0,0 + vspltisb v10,3 + lvsl v11,r6,r6 + vslb v10,v11,v10 + + cmpldi r4,64 /* Check maxlen < 64. */ + blt L(smaller) /* If maxlen < 64 */ + + /* In order to begin the 64B loop, it needs to be 64 + bytes aligned. So read quadwords until it is aligned or found null + bytes. At worst case it will be aligned after the fourth iteration, + so unroll the loop to avoid counter checking. */ + andi. r7,r5,63 /* Check if is 64 bytes aligned. */ + beq cr0,L(preloop_64B) /* If it is already 64B aligned. */ + lvx v1,r5,r6 + vcmpequb. v1,v1,v0 + addi r5,r5,16 + addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */ + bne cr6,L(found_aligning64B) /* If found null bytes. */ + + /* Unroll 3x above code block until aligned or find null bytes. */ + andi. r7,r5,63 + beq cr0,L(preloop_64B) + lvx v1,r5,r6 + vcmpequb. v1,v1,v0 + addi r5,r5,16 + addi r4,r4,-16 + bne cr6,L(found_aligning64B) + + andi. r7,r5,63 + beq cr0,L(preloop_64B) + lvx v1,r5,r6 + vcmpequb. v1,v1,v0 + addi r5,r5,16 + addi r4,r4,-16 + bne cr6,L(found_aligning64B) + + andi. r7,r5,63 + beq cr0,L(preloop_64B) + lvx v1,r5,r6 + vcmpequb. v1,v1,v0 + addi r5,r5,16 + addi r4,r4,-16 + bne cr6,L(found_aligning64B) + + /* At this point it should be 16 bytes aligned. + Prepare for the 64B loop. */ + .p2align 4 +L(preloop_64B): + /* Check if maxlen became is less than 64, therefore disallowing the + 64B loop. If it happened switch to the 16B loop code. */ + cmpldi r4,64 /* Check if maxlen < 64. */ + blt L(smaller) /* If maxlen < 64. */ + /* Set some constant values. */ + li r7,16 + li r10,32 + li r9,48 + + /* Compute the number of 64 bytes iterations needed. */ + srdi r11,r4,6 /* Compute loop count (maxlen / 64). */ + andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */ + mtctr r11 /* Move loop count to counter register. */ + + /* Handle maxlen > 64. Loop over the bytes in strides of 64B. */ + .p2align 4 +L(loop_64B): + lvx v1,r5,r6 /* r5 is the pointer to s. */ + lvx v2,r5,r7 + lvx v3,r5,r10 + lvx v4,r5,r9 + /* Compare the four 16B vectors to obtain the least 16 values. + Null bytes should emerge into v7, then check for null bytes. */ + vminub v5,v1,v2 + vminub v6,v3,v4 + vminub v7,v5,v6 + vcmpequb. v7,v7,v0 /* Check for null bytes. */ + addi r5,r5,64 /* Add pointer to next iteraction. */ + bne cr6,L(found_64B) /* If found null bytes. */ + bdnz L(loop_64B) /* Continue the loop if count > 0. */ + +/* Hit loop end without null match. So branch to handle the remainder. */ + + /* Prepare a 16B loop to handle two cases: + 1. If 32 > maxlen < 64. + 2. If maxlen >= 64, and reached end of the 64B loop with null + bytes not found. Thus handle the remainder bytes here. */ + .p2align 4 +L(smaller): + cmpldi r4,0 /* Check maxlen is zero. */ + beq L(done) /* If maxlen is zero. */ + + /* Place rounded up number of qw's to check into a vmx + register, and use some vector tricks to minimize + branching. */ + MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */ + vspltisb v5,1 + vspltisb v6,15 + vspltb v2,v7,7 + vaddubs v3,v5,v6 + +#ifdef __LITTLE_ENDIAN__ + vspltish v5,1 /* Compute 16 in each byte. */ +#endif + + /* Loop in 16B aligned incremements now. */ + .p2align 4 +L(loop_16B): + lvx v1,r5,r6 /* Load quadword into vector register. */ + addi r5,r5,16 /* Increment address to next 16B block. */ + vor v7,v2,v2 /* Save loop count (v2) into v7. */ + vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */ + vminub v4,v1,v2 + vcmpequb. v4,v4,v0 /* Checking for null bytes. */ + beq cr6,L(loop_16B) /* If null bytes not found. */ + + vcmpequb v1,v1,v0 + VBPERMQ(v1,v1,v10) +#ifdef __LITTLE_ENDIAN__ + vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */ + vandc v2,v2,v1 + VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */ +#else + VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */ +#endif + /* Truncate to maximum allowable offset. */ + vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond + maxlen. */ + vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */ + + MFVRD(r0,v1) + addi r5,r5,-16 /* Undo speculative bump. */ + extsb r0,r0 /* Clear whatever gunk is in the high 56b. */ + add r5,r5,r0 /* Add the offset of whatever was found. */ +L(done): + subf r3,r3,r5 /* Length is equal to the offset of null byte + matched minus the pointer to s. */ + blr /* Done. */ + + /* Handle case of maxlen > 64 and found null bytes in last block + of 64 bytes read. */ + .p2align 4 +L(found_64B): + /* A zero was found. Reduce the result. */ + vcmpequb v1,v1,v0 + vcmpequb v2,v2,v0 + vcmpequb v3,v3,v0 + vcmpequb v4,v4,v0 + + /* Permute the first bit of each byte into bits 48-63. */ + VBPERMQ(v1,v1,v10) + VBPERMQ(v2,v2,v10) + VBPERMQ(v3,v3,v10) + VBPERMQ(v4,v4,v10) + + /* Shift each component into its correct position for merging. */ +#ifdef __LITTLE_ENDIAN__ + vsldoi v2,v2,v2,2 + vsldoi v3,v3,v3,4 + vsldoi v4,v4,v4,6 +#else + vsldoi v1,v1,v1,6 + vsldoi v2,v2,v2,4 + vsldoi v3,v3,v3,2 +#endif + + /* Merge the results and move to a GPR. */ + vor v1,v2,v1 + vor v2,v3,v4 + vor v4,v1,v2 + + /* Adjust address to the start of the current 64B block. */ + addi r5,r5,-64 + + MFVRD(r10,v4) +#ifdef __LITTLE_ENDIAN__ + addi r9,r10,-1 /* Form a mask from trailing zeros. */ + andc r9,r9,r10 + popcntd r0,r9 /* Count the bits in the mask. */ +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + subf r5,r3,r5 + add r3,r5,r0 /* Compute final length. */ + blr /* Done. */ + + /* Handle case where null bytes were found while aligning + as a preparation for the 64B loop. */ + .p2align 4 +L(found_aligning64B): + VBPERMQ(v1,v1,v10) +#ifdef __LITTLE_ENDIAN__ + MFVRD(r10,v1) + addi r9,r10,-1 /* Form a mask from trailing zeros. */ + andc r9,r9,r10 + popcntd r0,r9 /* Count the bits in the mask. */ +#else + vsldoi v1,v1,v1,6 + MFVRD(r10,v1) + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + addi r5,r5,-16 /* Adjust address to offset of last 16 bytes + read. */ + /* Calculate length as subtracted the pointer to s of last 16 bytes + offset, added with the bytes before the match. */ + subf r5,r3,r5 + add r3,r5,r0 + blr /* Done. */ + + /* Handle case of maxlen > 32 and found a null bytes within the first + 16 bytes of s. */ + .p2align 4 +L(early_find): + bpermd r5,r8,r10 /* r8 contains the bit permute constants. */ + bpermd r6,r8,r11 + sldi r5,r5,8 + or r5,r5,r6 /* r5 should hold a 16B mask of + a potential 0. */ + cntlzd r5,r5 /* Count leading zeros. */ + addi r3,r5,-48 /* Deduct the 48 leading zeros always + present. */ + blr /* Done. */ + + /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */ + .p2align 4 +L(small_range): + clrrdi r8,r3,3 /* Align the pointer to 8B. */ + li r0,0 + /* Register's content at this point: + r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B, + r7 == last acceptable address. */ + cmpldi r4,0 /* Check if maxlen is zero. */ + beq L(end_max) /* If maxlen is zero. */ + + /* Calculate the last acceptable address and check for possible + addition overflow by using satured math: + r7 = r3 + r4 + r7 |= -(r7 < x) */ + add r7,r3,r4 + subfc r6,r3,r7 + subfe r9,r9,r9 + extsw r6,r9 + or r7,r7,r6 + addi r7,r7,-1 + + clrrdi r7,r7,3 /* Align to 8B address of last + acceptable address. */ + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + ld r12,0(r8) /* Load aligned doubleword. */ + cmpb r10,r12,r0 /* Check for null bytes. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + sld r10,r10,r6 +#else + sld r10,r10,r6 + srd r10,r10,r6 +#endif /* __LITTLE_ENDIAN__ */ + cmpldi cr7,r10,0 + bne cr7,L(done_small) /* If found null byte. */ + + cmpld r8,r7 /* Check if reached maxlen. */ + beq L(end_max) /* If reached maxlen. */ + + /* Still handling case of maxlen <= 32. Read doubleword aligned until + find null bytes or reach maxlen. */ + .p2align 4 +L(loop_small): + ldu r12,8(r8) /* Load next doubleword and update r8. */ + cmpb r10,r12,r0 /* Check for null bytes. */ + cmpldi cr6,r10,0 + bne cr6,L(done_small) /* If found null bytes. */ + cmpld r8,r7 /* Check if reached maxlen. */ + bne L(loop_small) /* If it has more bytes to read. */ + mr r3,r4 /* Reached maxlen with null bytes not found. + Length is equal to maxlen. */ + blr /* Done. */ + + /* Still handling case of maxlen <= 32. Found null bytes. + Registers: r10 == match bits within doubleword, r8 == address of + last doubleword read, r3 == pointer to s, r4 == maxlen. */ + .p2align 4 +L(done_small): +#ifdef __LITTLE_ENDIAN__ + /* Count trailing zeros. */ + addi r0,r10,-1 + andc r0,r0,r10 + popcntd r0,r0 +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + sub r3,r8,r3 /* Calculate total of bytes before the match. */ + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ + add r3,r3,r0 /* Length until the match. */ + cmpld r3,r4 /* Check length is greater than maxlen. */ + blelr + mr r3,r4 /* If length is greater than maxlen, return + maxlen. */ + blr + + /* Handle case of reached maxlen with null bytes not found. */ + .p2align 4 +L(end_max): + mr r3,r4 /* Length is equal to maxlen. */ + blr /* Done. */ + + +END (__strnlen) +libc_hidden_def (__strnlen) +weak_alias (__strnlen, strnlen) +libc_hidden_def (strnlen) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S new file mode 100644 index 0000000000..8eb74853c3 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strrchr.S @@ -0,0 +1,464 @@ +/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* char *[r3] strrchr (char *s [r3], int c [r4]) */ +/* TODO: change these to the actual instructions when the minimum required + binutils allows it. */ +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) +#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21))) +#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21))) +#define VADDUQM(t,a,b) .long (0x10000100 \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) +#ifdef __LITTLE_ENDIAN__ +/* Find the match position from v6 and place result in r6. */ +# define CALCULATE_MATCH() \ + VBPERMQ(v6, v6, v10); \ + vsldoi v6, v6, v6, 6; \ + MFVRD(r7, v6); \ + cntlzd r6, r7; \ + subfic r6, r6, 15; +/* + * Find the first null position to mask bytes after null. + * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw. + * Result placed at v2. + */ +# define FIND_NULL_POS(reg) \ + vspltisb v11, -1; \ + VADDUQM(v11, reg, v11); \ + vandc v11, v11, reg; \ + VPOPCNTD(v2, v11); \ + vspltb v11, v2, 15; \ + vcmpequb. v11, v11, v9; \ + blt cr6, 1f; \ + vsldoi v9, v0, v9, 1; \ + vslo v2, v2, v9; \ +1: \ + vsumsws v2, v2, v0; +#else +# define CALCULATE_MATCH() \ + VBPERMQ(v6, v6, v10); \ + MFVRD(r7, v6); \ + addi r6, r7, -1; \ + andc r6, r6, r7; \ + popcntd r6, r6; \ + subfic r6, r6, 15; +# define FIND_NULL_POS(reg) \ + VCLZD(v2, reg); \ + vspltb v11, v2, 7; \ + vcmpequb. v11, v11, v9; \ + blt cr6, 1f; \ + vsldoi v9, v0, v9, 1; \ + vsro v2, v2, v9; \ +1: \ + vsumsws v2, v2, v0; +#endif /* !__LITTLE_ENDIAN__ */ + .machine power7 +ENTRY (strrchr) + CALL_MCOUNT 2 + dcbt 0,r3 + clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ + cmpdi cr7,r4,0 + ld r12,0(r8) /* Load doubleword from memory. */ + li r9,0 /* Used to store last occurence. */ + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + + beq cr7,L(null_match) + + /* Replicate byte to doubleword. */ + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 + insrdi r4,r4,32,0 + + /* r4 is changed now. If it's passed more chars, then + check for null again. */ + cmpdi cr7,r4,0 + beq cr7,L(null_match) + /* Now r4 has a doubleword of c bytes and r0 has + a doubleword of null bytes. */ + + cmpb r10,r12,r4 /* Compare each byte against c byte. */ + cmpb r11,r12,r0 /* Compare each byte against null byte. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + srd r11,r11,r6 + sld r10,r10,r6 + sld r11,r11,r6 +#else + sld r10,r10,r6 + sld r11,r11,r6 + srd r10,r10,r6 + srd r11,r11,r6 +#endif + or r5,r10,r11 /* OR the results to speed things up. */ + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes + have been found. */ + bne cr7,L(done) + +L(align): + andi. r12, r8, 15 + + /* Are we now aligned to a doubleword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bne cr0, L(loop) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + bne cr7,L(done) + b L(loop) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + .p2align 5 +L(loop): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r7,16(r8) + cmpb r10,r12,r4 + cmpb r11,r12,r0 + cmpb r6,r7,r4 + cmpb r7,r7,r0 + or r12,r10,r11 + or r5,r6,r7 + or r5,r12,r5 + cmpdi cr7,r5,0 + beq cr7,L(vector) + + /* OK, one (or both) of the doublewords contains a c/null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a c/null byte. */ + cmpdi cr6,r12,0 + addi r8,r8,-8 + bne cr6,L(done) + + /* The c/null byte must be in the second doubleword. Adjust the + address again and move the result of cmpb to r10 so we can calculate + the pointer. */ + + mr r10,r6 + mr r11,r7 + addi r8,r8,8 + + /* r10/r11 have the output of the cmpb instructions, that is, + 0xff in the same position as the c/null byte in the original + doubleword from the string. Use that to calculate the pointer. */ + +L(done): + /* If there are more than one 0xff in r11, find the first position of + 0xff in r11 and fill r10 with 0 from that position. */ + cmpdi cr7,r11,0 + beq cr7,L(no_null) +#ifdef __LITTLE_ENDIAN__ + addi r3,r11,-1 + andc r3,r3,r11 + popcntd r0,r3 +#else + cntlzd r0,r11 +#endif + subfic r0,r0,63 + li r6,-1 +#ifdef __LITTLE_ENDIAN__ + srd r0,r6,r0 +#else + sld r0,r6,r0 +#endif + and r10,r0,r10 +L(no_null): +#ifdef __LITTLE_ENDIAN__ + cntlzd r0,r10 /* Count leading zeros before c matches. */ + addi r3,r10,-1 + andc r3,r3,r10 + addi r10,r11,-1 + andc r10,r10,r11 + cmpld cr7,r3,r10 + bgt cr7,L(no_match) +#else + addi r3,r10,-1 /* Count trailing zeros before c matches. */ + andc r3,r3,r10 + popcntd r0,r3 + cmpld cr7,r11,r10 + bgt cr7,L(no_match) +#endif + srdi r0,r0,3 /* Convert trailing zeros to bytes. */ + subfic r0,r0,7 + add r9,r8,r0 /* Return address of the matching c byte + or null in case c was not found. */ + li r0,0 + cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */ + beq cr7,L(align) + + .align 4 +L(no_match): + mr r3,r9 + blr + +/* Check the first 32B in GPR's and move to vectorized loop. */ + .p2align 5 +L(vector): + addi r3, r8, 8 + /* Make sure 32B aligned. */ + andi. r10, r3, 31 + bne cr0, L(loop) + vspltisb v0, 0 + /* Precompute vbpermq constant. */ + vspltisb v10, 3 + lvsl v11, r0, r0 + vslb v10, v11, v10 + MTVRD(v1, r4) + li r5, 16 + vspltb v1, v1, 7 + /* Compare 32 bytes in each loop. */ +L(continue): + lvx v4, 0, r3 + lvx v5, r3, r5 + vcmpequb v2, v0, v4 + vcmpequb v3, v0, v5 + vcmpequb v6, v1, v4 + vcmpequb v7, v1, v5 + vor v8, v2, v3 + vor v9, v6, v7 + vor v11, v8, v9 + vcmpequb. v11, v0, v11 + addi r3, r3, 32 + blt cr6, L(continue) + vcmpequb. v8, v0, v8 + blt cr6, L(match) + + /* One (or both) of the quadwords contains c/null. */ + vspltisb v8, 2 + vspltisb v9, 5 + /* Precompute values used for comparison. */ + vsl v9, v8, v9 /* v9 = 0x4040404040404040. */ + vaddubm v8, v9, v9 + vsldoi v8, v0, v8, 1 /* v8 = 0x80. */ + + /* Check if null is in second qw. */ + vcmpequb. v11, v0, v2 + blt cr6, L(secondqw) + + /* Null found in first qw. */ + addi r8, r3, -32 + /* Calculate the null position. */ + FIND_NULL_POS(v2) + /* Check if null is in the first byte. */ + vcmpequb. v11, v0, v2 + blt cr6, L(no_match) + vsububm v2, v8, v2 + /* Mask unwanted bytes after null. */ +#ifdef __LITTLE_ENDIAN__ + vslo v6, v6, v2 + vsro v6, v6, v2 +#else + vsro v6, v6, v2 + vslo v6, v6, v2 +#endif + vcmpequb. v11, v0, v6 + blt cr6, L(no_match) + /* Found a match before null. */ + CALCULATE_MATCH() + add r3, r8, r6 + blr + +L(secondqw): + addi r8, r3, -16 + FIND_NULL_POS(v3) + vcmpequb. v11, v0, v2 + blt cr6, L(no_match1) + vsububm v2, v8, v2 + /* Mask unwanted bytes after null. */ +#ifdef __LITTLE_ENDIAN__ + vslo v7, v7, v2 + vsro v7, v7, v2 +#else + vsro v7, v7, v2 + vslo v7, v7, v2 +#endif + vcmpequb. v11, v0, v7 + blt cr6, L(no_match1) + addi r8, r8, 16 + vor v6, v0, v7 +L(no_match1): + addi r8, r8, -16 + vcmpequb. v11, v0, v6 + blt cr6, L(no_match) + /* Found a match before null. */ + CALCULATE_MATCH() + add r3, r8, r6 + blr + +L(match): + /* One (or both) of the quadwords contains a match. */ + mr r8, r3 + vcmpequb. v8, v0, v7 + blt cr6, L(firstqw) + /* Match found in second qw. */ + addi r8, r8, 16 + vor v6, v0, v7 +L(firstqw): + addi r8, r8, -32 + CALCULATE_MATCH() + add r9, r8, r6 /* Compute final length. */ + b L(continue) +/* We are here because strrchr was called with a null byte. */ + .align 4 +L(null_match): + /* r0 has a doubleword of null bytes. */ + + cmpb r5,r12,r0 /* Compare each byte against null bytes. */ + + /* Move the doublewords left and right to discard the bits that are + not part of the string and bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r5,r5,r6 + sld r5,r5,r6 +#else + sld r5,r5,r6 + srd r5,r5,r6 +#endif + cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes + have been found. */ + bne cr7,L(done_null) + + andi. r12, r8, 15 + + /* Are we now aligned to a quadword boundary? If so, skip to + the main loop. Otherwise, go through the alignment code. */ + + bne cr0, L(loop_null) + + /* Handle WORD2 of pair. */ + ldu r12,8(r8) + cmpb r5,r12,r0 + cmpdi cr7,r5,0 + bne cr7,L(done_null) + b L(loop_null) /* We branch here (rather than falling through) + to skip the nops due to heavy alignment + of the loop below. */ + + /* Main loop to look for the end of the string. Since it's a + small loop (< 8 instructions), align it to 32-bytes. */ + .p2align 5 +L(loop_null): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + ld r12,8(r8) + ldu r11,16(r8) + cmpb r5,r12,r0 + cmpb r10,r11,r0 + or r6,r5,r10 + cmpdi cr7,r6,0 + beq cr7,L(vector1) + + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + cmpdi cr6,r5,0 + addi r8,r8,-8 + bne cr6,L(done_null) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + pointer. */ + + mr r5,r10 + addi r8,r8,8 + + /* r5 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the pointer. */ +L(done_null): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 +#else + cntlzd r0,r5 /* Count leading zeros before the match. */ +#endif + srdi r0,r0,3 /* Convert trailing zeros to bytes. */ + add r3,r8,r0 /* Return address of the matching null byte. */ + blr +/* Check the first 32B in GPR's and move to vectorized loop. */ + .p2align 5 +L(vector1): + addi r3, r8, 8 + /* Make sure 32B aligned. */ + andi. r10, r3, 31 + bne cr0, L(loop_null) + vspltisb v0, 0 + /* Precompute vbpermq constant. */ + vspltisb v10, 3 + lvsl v11, r0, r0 + vslb v10, v11, v10 + li r5, 16 + /* Compare 32 bytes in each loop. */ +L(continue1): + lvx v4, 0, r3 + lvx v5, r3, r5 + vcmpequb v2, v0, v4 + vcmpequb v3, v0, v5 + vor v8, v2, v3 + vcmpequb. v11, v0, v8 + addi r3, r3, 32 + blt cr6, L(continue1) + addi r3, r3, -32 + VBPERMQ(v2, v2, v10) + VBPERMQ(v3, v3, v10) + /* Shift each component into its correct position for merging. */ +#ifdef __LITTLE_ENDIAN__ + vsldoi v3, v3, v3, 2 +#else + vsldoi v2, v2, v2, 6 + vsldoi v3, v3, v3, 4 +#endif + /* Merge the results and move to a GPR. */ + vor v4, v3, v2 + MFVRD(r5, v4) +#ifdef __LITTLE_ENDIAN__ + addi r6, r5, -1 + andc r6, r6, r5 + popcntd r6, r6 +#else + cntlzd r6, r5 /* Count leading zeros before the match. */ +#endif + add r3, r3, r6 /* Compute final length. */ + blr +END (strrchr) +weak_alias (strrchr, rindex) +libc_hidden_builtin_def (strrchr) diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S new file mode 100644 index 0000000000..e9271898f2 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/strspn.S @@ -0,0 +1,202 @@ +/* Optimized strspn implementation for Power8. + + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* size_t [r3] strspn (const char *string [r3], + const char *needleAccept [r4]) */ + +/* This takes a novel approach by computing a 256 bit mask whereby + each set bit implies the byte is "accepted". P8 vector hardware + has extremely efficient hardware for selecting bits from a mask. + + One might ask "why not use bpermd for short strings"? It is + so slow that its performance about matches the generic PPC64 + variant without any fancy masking, with the added expense of + making the mask. That was the first variant of this. */ + + + +#include "sysdep.h" + +#ifndef USE_AS_STRCSPN +# define USE_AS_STRCSPN 0 +# ifndef STRSPN +# define STRSPN strspn +# endif +# define INITIAL_MASK 0 +# define UPDATE_MASK(RA, RS, RB) or RA, RS, RB +#else +# ifndef STRSPN +# define STRSPN strcspn +# endif +# define INITIAL_MASK -1 +# define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB +#endif + +/* Simple macro to use VSX instructions in overlapping VR's. */ +#define XXVR(insn, vrt, vra, vrb) \ + insn 32+vrt, 32+vra, 32+vrb + +/* ISA 2.07B instructions are not all defined for older binutils. + Macros are defined below for these newer instructions in order + to maintain compatibility. */ + +/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */ +#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) + +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) + + /* This can be updated to power8 once the minimum version of + binutils supports power8 and the above instructions. */ + .machine power7 +EALIGN(STRSPN, 4, 0) + CALL_MCOUNT 2 + + /* Generate useful constants for later on. */ + vspltisb v1, 7 + vspltisb v2, -1 + vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */ + vspltisb v10, 0 + vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */ + XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */ + + /* Prepare to compute 256b mask. */ + addi r4, r4, -1 + li r5, INITIAL_MASK + li r6, INITIAL_MASK + li r7, INITIAL_MASK + li r8, INITIAL_MASK + +#if USE_AS_STRCSPN + /* Ensure the null character never matches by clearing ISA bit 0 in + in r5 which is the bit which will check for it in the later usage + of vbpermq. */ + srdi r5, r5, 1 +#endif + + li r11, 1 + sldi r11, r11, 63 + + /* Start interleaved Mask computation. + This will eventually or 1's into ignored bits from vbpermq. */ + lvsr v11, 0, r3 + vspltb v11, v11, 0 /* Splat shift constant. */ + + /* Build a 256b mask in r5-r8. */ + .align 4 +L(next_needle): + lbzu r9, 1(r4) + + cmpldi cr0, r9, 0 + cmpldi cr1, r9, 128 + + /* This is a little tricky. srd only uses the first 7 bits, + and if bit 7 is set, value is always 0. So, we can + effectively shift 128b in this case. */ + xori r12, r9, 0x40 /* Invert bit 6. */ + srd r10, r11, r9 /* Mask for bits 0-63. */ + srd r12, r11, r12 /* Mask for bits 64-127. */ + + beq cr0, L(start_cmp) + + /* Now, or the value into the correct GPR. */ + bge cr1,L(needle_gt128) + UPDATE_MASK (r5, r5, r10) /* 0 - 63. */ + UPDATE_MASK (r6, r6, r12) /* 64 - 127. */ + b L(next_needle) + + .align 4 +L(needle_gt128): + UPDATE_MASK (r7, r7, r10) /* 128 - 191. */ + UPDATE_MASK (r8, r8, r12) /* 192 - 255. */ + b L(next_needle) + + + .align 4 +L(start_cmp): + /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */ + mr r0, r3 /* Save r3 for final length computation. */ + MTVRD (v5, r5) + MTVRD (v6, r6) + MTVRD (v7, r7) + MTVRD (v8, r8) + + /* Continue interleaved mask generation. */ +#ifdef __LITTLE_ENDIAN__ + vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */ + vsplth v11, v11, 0 /* Only care about the high 16 bits of v10. */ +#else + vslw v11, v2, v11 /* Note, shift ignores higher order bits. */ + vsplth v11, v11, 1 /* Only care about the low 16 bits of v10. */ +#endif + lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */ + + /* Do the merging of the bitmask. */ + XXVR(xxmrghd, v5, v5, v6) + XXVR(xxmrghd, v6, v7, v8) + + /* Finish mask generation. */ + vand v11, v11, v4 /* Throwaway bits not in the mask. */ + + /* Compare the first 1-16B, while masking unwanted bytes. */ + clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */ + vxor v9, v0, v1 /* Swap high bit. */ + VBPERMQ (v8, v5, v0) + VBPERMQ (v7, v6, v9) + vor v7, v7, v8 + vor v7, v7, v11 /* Ignore non-participating bytes. */ + vcmpequh. v8, v7, v4 + bnl cr6, L(done) + + addi r3, r3, 16 + + .align 4 +L(vec): + lvx v0, 0, r3 + addi r3, r3, 16 + vxor v9, v0, v1 /* Swap high bit. */ + VBPERMQ (v8, v5, v0) + VBPERMQ (v7, v6, v9) + vor v7, v7, v8 + vcmpequh. v8, v7, v4 + blt cr6, L(vec) + + addi r3, r3, -16 +L(done): + subf r3, r0, r3 + MFVRD (r10, v7) + +#ifdef __LITTLE_ENDIAN__ + addi r0, r10, 1 /* Count the trailing 1's. */ + andc r10, r10, r0 + popcntd r10, r10 +#else + xori r10, r10, 0xffff /* Count leading 1's by inverting. */ + addi r3, r3, -48 /* Account for the extra leading zeros. */ + cntlzd r10, r10 +#endif + + add r3, r3, r10 + blr + +END(STRSPN) +libc_hidden_builtin_def (STRSPN) |