author     Zack Weinberg <zackw@panix.com>   2017-06-08 15:39:03 -0400
committer  Zack Weinberg <zackw@panix.com>   2017-06-08 15:39:03 -0400
commit     5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree       4470480d904b65cf14ca524f96f79eca818c3eaf /sysdeps/powerpc/powerpc64/power8
parent     199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
Prepare for radical source tree reorganization.  [branch: zack/build-layout-experiment]
All top-level files and directories are moved into a temporary storage directory, REORG.TODO, except for files that will certainly still exist in their current form at top level when we're done (COPYING, COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which are moved to the new directory OldChangeLogs, instead), and the generated file INSTALL (which is just deleted; in the new order, there will be no generated files checked into version control).
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8')
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/Implies                 |    2
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/Makefile                |    3
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/Implies             |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S            |  303
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies   |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S            |  508
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S          |   56
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S         |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S           |   61
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S          |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S           |   56
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S          |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S          |   45
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S         |   48
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S            |  519
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/memcmp.S                | 1447
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/memset.S                |  458
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/multiarch/Implies       |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/stpcpy.S                |   24
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/stpncpy.S               |   24
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcasecmp.S            |  457
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c      |   29
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcasestr.S            |  538
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strchr.S                |  377
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strchrnul.S             |   23
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcmp.S                |  247
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcpy.S                |  270
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcspn.S               |   20
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strlen.S                |  301
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strncase.S              |   20
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strncmp.S               |  327
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strncpy.S               |  465
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strnlen.S               |  433
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strrchr.S               |  464
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strspn.S                |  202
35 files changed, 0 insertions, 7733 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/Implies b/sysdeps/powerpc/powerpc64/power8/Implies
deleted file mode 100644
index 9a5e3c7277..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/Implies
+++ /dev/null
@@ -1,2 +0,0 @@
-powerpc/powerpc64/power7/fpu
-powerpc/powerpc64/power7
diff --git a/sysdeps/powerpc/powerpc64/power8/Makefile b/sysdeps/powerpc/powerpc64/power8/Makefile
deleted file mode 100644
index 71a59529f3..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-ifeq ($(subdir),string)
-sysdep_routines += strcasestr-ppc64
-endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/Implies
deleted file mode 100644
index 1187cdfb0a..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/fpu/
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
deleted file mode 100644
index 4c42926a74..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
+++ /dev/null
@@ -1,303 +0,0 @@
-/* Optimized expf(). PowerPC64/POWER8 version.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* Short algorithm description:
- *
- * Let K = 64 (table size).
- * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
- * where:
- * x = m*log(2)/K + y, y in [0.0..log(2)/K]
- * m = n*K + j, m,n,j - signed integer, j in [0..K-1]
- * values of 2^(j/K) are tabulated as T[j].
- *
- * P(y) is a minimax polynomial approximation of expf(y)-1
- * on small interval [0.0..log(2)/K].
- *
- * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
- * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
- *
- * Special cases:
- * expf(NaN) = NaN
- * expf(+INF) = +INF
- * expf(-INF) = 0
- * expf(x) = 1 for subnormals
- * for finite argument, only expf(0)=1 is exact
- * expf(x) overflows if x>88.7228317260742190
- * expf(x) underflows if x<-103.972076416015620
- */
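
[Editor's sketch] A rough C model of the scheme just described, for readers who prefer to see it before the assembly; K, T[], and P0..P3 correspond to the .Ttable and .P0...P3 data further down, and the rounding-by-magic-constant (adding and subtracting 2^23 + 2^22) is replaced by nearbyint. Illustrative names, not part of the source.

#include <math.h>

/* Coefficients from .P0 .. .P3 below, as C hex floats.  */
static const double P0 = 0x1.fffffffffe7c6p-1, P1 = 0x1.00000008d6118p-1,
                    P2 = 0x1.55550da752d4fp-3, P3 = 0x1.56420eb78fa85p-5;
extern const double T[64];       /* T[j] = 2^(j/64), the .Ttable data.  */

static float expf_model (float x)
{
  int m = (int) nearbyint (x * (64.0 / M_LN2)); /* asm: add/sub 2^23+2^22 */
  int j = m & 63;                               /* low 6 bits of m        */
  int n = m >> 6;                               /* arithmetic shift       */
  double y = x - m * (M_LN2 / 64.0);            /* y in [0 .. log(2)/64]  */
  double z = y * y;
  double P = (P3 * z + P1) * z + (P2 * z + P0) * y;
  return (float) scalbn (T[j] * (1.0 + P), n);
}
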
-
-#define C1 0x42ad496b /* Single precision 125*log(2). */
-#define C2 0x31800000 /* Single precision 2^(-28). */
-#define SP_INF 0x7f800000 /* Single precision Inf. */
-#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias. */
-
-#define DATA_OFFSET r9
-
-/* Implements the function
-
- float [fp1] expf (float [fp1] x) */
-
- .machine power8
-EALIGN(__ieee754_expf, 4, 0)
- addis DATA_OFFSET,r2,.Lanchor@toc@ha
- addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l
-
- xscvdpspn v0,v1
- mfvsrd r8,v0 /* r8 = x */
- lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET)
- lfd fp3,(.P2-.Lanchor)(DATA_OFFSET)
- rldicl r3,r8,32,33 /* r3 = |x| */
- lis r4,C1@ha /* r4 = 125*log(2) */
- ori r4,r4,C1@l
- cmpw r3,r4
- lfd fp5,(.P3-.Lanchor)(DATA_OFFSET)
- lfd fp4,(.RS-.Lanchor)(DATA_OFFSET)
- fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */
- bge L(special_paths) /* |x| >= 125*log(2) ? */
-
- lis r4,C2@ha
- ori r4,r4,C2@l
- cmpw r3,r4
- blt L(small_args) /* |x| < 2^(-28) ? */
-
- /* Main path: here if 2^(-28) <= |x| < 125*log(2) */
- frsp fp6,fp2
- xscvdpsp v2,v2
- mfvsrd r8,v2
- mr r3,r8 /* r3 = m */
- rldicl r8,r8,32,58 /* r8 = j */
- lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
- fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */
- srdi r3,r3,32
- clrrwi r3,r3,6 /* r3 = n */
- lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
- fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
- fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
- lfd fp4,(.P1-.Lanchor)(DATA_OFFSET)
- lfd fp6,(.P0-.Lanchor)(DATA_OFFSET)
- lis r4,SP_EXP_BIAS@ha
- ori r4,r4,SP_EXP_BIAS@l
- add r3,r3,r4
- rldic r3,r3,49,1 /* r3 = 2^n */
- fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
- fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
- mtvsrd v1,r3
- xscvspdp v1,v1
- fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */
- fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
- sldi r8,r8,3 /* Access doublewords from T[j]. */
- addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
- lfdx fp3,r6,r8
- fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */
- fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */
- frsp fp1,fp1
- blr
-
- .align 4
-/* x is either underflow, overflow, infinite or NaN. */
-L(special_paths):
- srdi r8,r8,32
- rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive.
- r8 = 4, otherwise. */
- addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor)
- lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */
- cmpw r3,r4
- /* |x| <= .SPRANGE[signbit(x)] */
- ble L(near_under_or_overflow)
-
- lis r4,SP_INF@ha
- ori r4,r4,SP_INF@l
- cmpw r3,r4
- bge L(arg_inf_or_nan) /* |x| >= Inf ? */
-
- addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor)
- lfsx fp1,r6,r8
- fmuls fp1,fp1,fp1
- blr
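
[Editor's sketch] The lfsx/fmuls pair above is the classic saturation trick: .SPLARGE_SMALL holds 2^100 (positive x, overflow side) and 2^-100 (negative x, underflow side), and squaring the loaded constant produces +Inf or +0 while raising the proper IEEE exceptions. A minimal C equivalent, assuming x is already known to be beyond the finite range:

static float expf_saturate (int x_is_negative)
{
  float f = x_is_negative ? 0x1p-100f : 0x1p+100f;   /* .SPLARGE_SMALL */
  return f * f;  /* 2^200 -> +Inf (overflow), 2^-200 -> +0 (underflow) */
}
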
-
-
- .align 4
-L(small_args):
- /* expf(x) = 1.0, where |x| < 2^(-28) */
- lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET)
- fadds fp1,fp1,fp2
- blr
-
-
- .align 4
-L(arg_inf_or_nan):
- bne L(arg_nan)
-
- /* expf(+INF) = +INF
- expf(-INF) = 0 */
- addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor)
- lfsx fp1,r6,r8
- blr
-
-
- .align 4
-L(arg_nan):
- /* expf(NaN) = NaN */
- fadd fp1,fp1,fp1
- frsp fp1,fp1
- blr
-
- .align 4
-L(near_under_or_overflow):
- frsp fp6,fp2
- xscvdpsp v2,v2
- mfvsrd r8,v2
- mr r3,r8 /* r3 = m */
- rldicl r8,r8,32,58 /* r8 = j */
- lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
- fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */
- srdi r3,r3,32
- clrrwi r3,r3,6 /* r3 = n */
- lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
- fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
- fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
- lfd fp4,(.P1-.Lanchor)(DATA_OFFSET)
- lfd fp6,(.P0-.Lanchor)(DATA_OFFSET)
- ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET)
- add r3,r3,r4
- rldic r3,r3,46,1 /* r3 = 2^n */
- fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
- fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
- mtvsrd v1,r3
- fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */
- fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
- sldi r8,r8,3 /* Access doublewords from T[j]. */
- addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
- lfdx fp3,r6,r8
- fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */
- fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */
- frsp fp1,fp1
- blr
-END(__ieee754_expf)
-
- .section .rodata, "a",@progbits
-.Lanchor:
- .balign 8
-/* Table T[j] = 2^(j/K). Double precision. */
-.Ttable:
- .8byte 0x3ff0000000000000
- .8byte 0x3ff02c9a3e778061
- .8byte 0x3ff059b0d3158574
- .8byte 0x3ff0874518759bc8
- .8byte 0x3ff0b5586cf9890f
- .8byte 0x3ff0e3ec32d3d1a2
- .8byte 0x3ff11301d0125b51
- .8byte 0x3ff1429aaea92de0
- .8byte 0x3ff172b83c7d517b
- .8byte 0x3ff1a35beb6fcb75
- .8byte 0x3ff1d4873168b9aa
- .8byte 0x3ff2063b88628cd6
- .8byte 0x3ff2387a6e756238
- .8byte 0x3ff26b4565e27cdd
- .8byte 0x3ff29e9df51fdee1
- .8byte 0x3ff2d285a6e4030b
- .8byte 0x3ff306fe0a31b715
- .8byte 0x3ff33c08b26416ff
- .8byte 0x3ff371a7373aa9cb
- .8byte 0x3ff3a7db34e59ff7
- .8byte 0x3ff3dea64c123422
- .8byte 0x3ff4160a21f72e2a
- .8byte 0x3ff44e086061892d
- .8byte 0x3ff486a2b5c13cd0
- .8byte 0x3ff4bfdad5362a27
- .8byte 0x3ff4f9b2769d2ca7
- .8byte 0x3ff5342b569d4f82
- .8byte 0x3ff56f4736b527da
- .8byte 0x3ff5ab07dd485429
- .8byte 0x3ff5e76f15ad2148
- .8byte 0x3ff6247eb03a5585
- .8byte 0x3ff6623882552225
- .8byte 0x3ff6a09e667f3bcd
- .8byte 0x3ff6dfb23c651a2f
- .8byte 0x3ff71f75e8ec5f74
- .8byte 0x3ff75feb564267c9
- .8byte 0x3ff7a11473eb0187
- .8byte 0x3ff7e2f336cf4e62
- .8byte 0x3ff82589994cce13
- .8byte 0x3ff868d99b4492ed
- .8byte 0x3ff8ace5422aa0db
- .8byte 0x3ff8f1ae99157736
- .8byte 0x3ff93737b0cdc5e5
- .8byte 0x3ff97d829fde4e50
- .8byte 0x3ff9c49182a3f090
- .8byte 0x3ffa0c667b5de565
- .8byte 0x3ffa5503b23e255d
- .8byte 0x3ffa9e6b5579fdbf
- .8byte 0x3ffae89f995ad3ad
- .8byte 0x3ffb33a2b84f15fb
- .8byte 0x3ffb7f76f2fb5e47
- .8byte 0x3ffbcc1e904bc1d2
- .8byte 0x3ffc199bdd85529c
- .8byte 0x3ffc67f12e57d14b
- .8byte 0x3ffcb720dcef9069
- .8byte 0x3ffd072d4a07897c
- .8byte 0x3ffd5818dcfba487
- .8byte 0x3ffda9e603db3285
- .8byte 0x3ffdfc97337b9b5f
- .8byte 0x3ffe502ee78b3ff6
- .8byte 0x3ffea4afa2a490da
- .8byte 0x3ffefa1bee615a27
- .8byte 0x3fff50765b6e4540
- .8byte 0x3fffa7c1819e90d8
-
-.KLN2:
- .8byte 0x40571547652b82fe /* Double precision K/log(2). */
-
-/* Double precision polynomial coefficients. */
-.P0:
- .8byte 0x3fefffffffffe7c6
-.P1:
- .8byte 0x3fe00000008d6118
-.P2:
- .8byte 0x3fc55550da752d4f
-.P3:
- .8byte 0x3fa56420eb78fa85
-
-.RS:
- .8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */
-.NLN2K:
- .8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */
-.DP_EXP_BIAS:
- .8byte 0x000000000000ffc0 /* Double precision exponent bias. */
-
- .balign 4
-.SPone:
- .4byte 0x3f800000 /* Single precision 1.0. */
-.SP_RS:
- .4byte 0x4b400000 /* Single precision 2^23 + 2^22. */
-
-.SPRANGE: /* Single precision overflow/underflow bounds. */
- .4byte 0x42b17217 /* If x > this bound, the result overflows. */
- .4byte 0x42cff1b4 /* If x < -(this bound), the result underflows. */
-
-.SPLARGE_SMALL:
- .4byte 0x71800000 /* 2^100. */
- .4byte 0x0d800000 /* 2^-100. */
-
-.INF_ZERO:
- .4byte 0x7f800000 /* Single precision Inf. */
- .4byte 0 /* Single precision zero. */
-
-strong_alias (__ieee754_expf, __expf_finite)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
deleted file mode 100644
index 7fd86fdf87..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/fpu/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
deleted file mode 100644
index 8dfa0076e0..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
+++ /dev/null
@@ -1,508 +0,0 @@
-/* Optimized cosf(). PowerPC64/POWER8 version.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#define _ERRNO_H 1
-#include <bits/errno.h>
-
-#define FRAMESIZE (FRAME_MIN_SIZE+16)
-
-#define FLOAT_EXPONENT_SHIFT 23
-#define FLOAT_EXPONENT_BIAS 127
-#define INTEGER_BITS 3
-
-#define PI_4 0x3f490fdb /* PI/4 */
-#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */
-#define TWO_PN5 0x3d000000 /* 2^-5 */
-#define TWO_PN27 0x32000000 /* 2^-27 */
-#define INFINITY 0x7f800000
-#define TWO_P23 0x4b000000 /* 2^23 */
-#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */
-
- /* Implements the function
-
- float [fp1] cosf (float [fp1] x) */
-
- .machine power8
-EALIGN(__cosf, 4, 0)
- addis r9,r2,L(anchor)@toc@ha
- addi r9,r9,L(anchor)@toc@l
-
- lis r4,PI_4@h
- ori r4,r4,PI_4@l
-
- xscvdpspn v0,v1
- mfvsrd r8,v0
- rldicl r3,r8,32,33 /* Remove sign bit. */
-
- cmpw r3,r4
- bge L(greater_or_equal_pio4)
-
- lis r4,TWO_PN5@h
- ori r4,r4,TWO_PN5@l
-
- cmpw r3,r4
- blt L(less_2pn5)
-
- /* Chebyshev polynomial of the form:
- * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
-
- lfd fp9,(L(C0)-L(anchor))(r9)
- lfd fp10,(L(C1)-L(anchor))(r9)
- lfd fp11,(L(C2)-L(anchor))(r9)
- lfd fp12,(L(C3)-L(anchor))(r9)
- lfd fp13,(L(C4)-L(anchor))(r9)
-
- fmul fp2,fp1,fp1 /* x^2 */
- lfd fp3,(L(DPone)-L(anchor))(r9)
-
- fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
- fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
- fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
- fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
- fmadd fp1,fp2,fp4,fp3 /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
- frsp fp1,fp1 /* Round to single precision. */
-
- blr
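
[Editor's sketch] The fmadd chain above is Horner's rule on z = x^2. Written out in C, with the coefficients from L(C0)..L(C4) below spelled as hex floats of the same bit patterns, the fast path reduces to:

static const double C0 = -0x1.ffffffffe98aep-2, C1 = 0x1.55555545c50c7p-5,
                    C2 = -0x1.6c16b348b6874p-10, C3 = 0x1.a00eb9ac43cc0p-16,
                    C4 = -0x1.23c97dd8844d7p-22;

static double cos_kernel (double x)   /* |x| <= PI/4 */
{
  double z = x * x;                   /* fmul fp2,fp1,fp1 */
  return 1.0 + z * (C0 + z * (C1 + z * (C2 + z * (C3 + z * C4))));
}
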
-
- .balign 16
-L(greater_or_equal_pio4):
- lis r4,NINEPI_4@h
- ori r4,r4,NINEPI_4@l
- cmpw r3,r4
- bge L(greater_or_equal_9pio4)
-
- /* Calculate quotient of |x|/(PI/4). */
- lfd fp2,(L(invpio4)-L(anchor))(r9)
- fabs fp1,fp1 /* |x| */
- fmul fp2,fp1,fp2 /* |x|/(PI/4) */
- fctiduz fp2,fp2
- mfvsrd r3,v2 /* n = |x| / (PI/4), truncated */
-
- /* Now use that quotient to find |x| mod (PI/2). */
- addi r7,r3,1
- rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */
- addi r6,r9,(L(pio2_table)-L(anchor))
- lfdx fp4,r5,r6
- fsub fp1,fp1,fp4
-
- .balign 16
-L(reduced):
- /* Now we are in the range -PI/4 to PI/4. */
-
- /* Work out if we are in a positive or negative primary interval. */
- addi r7,r7,2
- rldicl r4,r7,62,63 /* ((n+3) >> 2) & 1 */
-
- /* Load a 1.0 or -1.0. */
- addi r5,r9,(L(ones)-L(anchor))
- sldi r4,r4,3
- lfdx fp0,r4,r5
-
- /* Are we in the primary interval of sin or cos? */
- andi. r4,r7,0x2
- bne L(cos)
-
- /* Chebyshev polynomial of the form:
- x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
-
- lfd fp9,(L(S0)-L(anchor))(r9)
- lfd fp10,(L(S1)-L(anchor))(r9)
- lfd fp11,(L(S2)-L(anchor))(r9)
- lfd fp12,(L(S3)-L(anchor))(r9)
- lfd fp13,(L(S4)-L(anchor))(r9)
-
- fmul fp2,fp1,fp1 /* x^2 */
- fmul fp3,fp2,fp1 /* x^3 */
-
- fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
- fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
- fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
- fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
- fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
- fmul fp4,fp4,fp0 /* Add in the sign. */
- frsp fp1,fp4 /* Round to single precision. */
-
- blr
-
- .balign 16
-L(cos):
- /* Chebyshev polynomial of the form:
- 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
-
- lfd fp9,(L(C0)-L(anchor))(r9)
- lfd fp10,(L(C1)-L(anchor))(r9)
- lfd fp11,(L(C2)-L(anchor))(r9)
- lfd fp12,(L(C3)-L(anchor))(r9)
- lfd fp13,(L(C4)-L(anchor))(r9)
-
- fmul fp2,fp1,fp1 /* x^2 */
- lfd fp3,(L(DPone)-L(anchor))(r9)
-
- fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
- fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
- fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
- fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
- fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
- fmul fp4,fp4,fp0 /* Add in the sign. */
- frsp fp1,fp4 /* Round to single precision. */
-
- blr
-
- .balign 16
-L(greater_or_equal_9pio4):
- lis r4,INFINITY@h
- ori r4,r4,INFINITY@l
- cmpw r3,r4
- bge L(inf_or_nan)
-
- lis r4,TWO_P23@h
- ori r4,r4,TWO_P23@l
- cmpw r3,r4
- bge L(greater_or_equal_2p23)
-
- fabs fp1,fp1 /* |x| */
-
- /* Calculate quotient of |x|/(PI/4). */
- lfd fp2,(L(invpio4)-L(anchor))(r9)
-
- lfd fp3,(L(DPone)-L(anchor))(r9)
- lfd fp4,(L(DPhalf)-L(anchor))(r9)
- fmul fp2,fp1,fp2 /* |x|/(PI/4) */
- friz fp2,fp2 /* n = floor(|x|/(PI/4)) */
-
- /* Calculate (n + 1) / 2. */
- fadd fp2,fp2,fp3 /* n + 1 */
- fmul fp3,fp2,fp4 /* (n + 1) / 2 */
- friz fp3,fp3
-
- lfd fp4,(L(pio2hi)-L(anchor))(r9)
- lfd fp5,(L(pio2lo)-L(anchor))(r9)
-
- fmul fp6,fp4,fp3
- fadd fp6,fp6,fp1
- fmadd fp1,fp5,fp3,fp6
-
- fctiduz fp2,fp2
- mfvsrd r7,v2 /* n + 1 */
-
- b L(reduced)
-
- .balign 16
-L(inf_or_nan):
- bne L(skip_errno_setting) /* Is it a NaN? */
-
- /* We delayed the creation of the stack frame, as well as the saving of
- the link register, because only at this point, we are sure that
- doing so is actually needed. */
-
- stfd fp1,-8(r1)
-
- /* Save the link register. */
- mflr r0
- std r0,16(r1)
- cfi_offset(lr, 16)
-
- /* Create the stack frame. */
- stdu r1,-FRAMESIZE(r1)
- cfi_adjust_cfa_offset(FRAMESIZE)
-
- bl JUMPTARGET(__errno_location)
- nop
-
- /* Restore the stack frame. */
- addi r1,r1,FRAMESIZE
- cfi_adjust_cfa_offset(-FRAMESIZE)
- /* Restore the link register. */
- ld r0,16(r1)
- mtlr r0
-
- lfd fp1,-8(r1)
-
- /* errno = EDOM */
- li r4,EDOM
- stw r4,0(r3)
-
-L(skip_errno_setting):
- fsub fp1,fp1,fp1 /* x - x */
- blr
-
- .balign 16
-L(greater_or_equal_2p23):
- fabs fp1,fp1
-
- srwi r4,r3,FLOAT_EXPONENT_SHIFT
- subi r4,r4,FLOAT_EXPONENT_BIAS
-
- /* We reduce the input modulo pi/4, so we need 3 bits of integer
- to determine where in 2*pi we are. Index into our array
- accordingly. */
- addi r4,r4,INTEGER_BITS
-
- /* To avoid an expensive divide, for the range we care about (0 - 127)
- we can transform x/28 into:
-
- x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
-
- mulhwu returns the top 32 bits of the 64 bit result, doing the
- shift for us in the same instruction. The top 32 bits are undefined,
- so we have to mask them. */
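
[Editor's sketch] A small C model of this reciprocal-multiplication trick, with a check over the whole range the comment claims (the constant is FX_FRACTION_1_28 from above):

#include <assert.h>
#include <stdint.h>

static uint32_t div28 (uint32_t x)            /* valid for x in [0, 127] */
{
  /* 32x32 -> 64 multiply, keep the high word: what mulhwu does.  */
  return (uint32_t) (((uint64_t) x * 0x9249250u) >> 32);
}

static void div28_check (void)
{
  for (uint32_t x = 0; x < 128; x++)
    assert (div28 (x) == x / 28);
}
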
-
- lis r6,FX_FRACTION_1_28@h
- ori r6,r6,FX_FRACTION_1_28@l
- mulhwu r5,r4,r6
- clrldi r5,r5,32
-
- /* Get our pointer into the invpio4_table array. */
- sldi r4,r5,3
- addi r6,r9,(L(invpio4_table)-L(anchor))
- add r4,r4,r6
-
- lfd fp2,0(r4)
- lfd fp3,8(r4)
- lfd fp4,16(r4)
- lfd fp5,24(r4)
-
- fmul fp6,fp2,fp1
- fmul fp7,fp3,fp1
- fmul fp8,fp4,fp1
- fmul fp9,fp5,fp1
-
- /* Mask off larger integer bits in highest double word that we don't
- care about to avoid losing precision when combining with smaller
- values. */
- fctiduz fp10,fp6
- mfvsrd r7,v10
- rldicr r7,r7,0,(63-INTEGER_BITS)
- mtvsrd v10,r7
- fcfidu fp10,fp10 /* Integer bits. */
-
- fsub fp6,fp6,fp10 /* highest -= integer bits */
-
- /* Work out the integer component, rounded down. Use the top two
- limbs for this. */
- fadd fp10,fp6,fp7 /* highest + higher */
-
- fctiduz fp10,fp10
- mfvsrd r7,v10
- andi. r0,r7,1
- fcfidu fp10,fp10
-
- /* Subtract integer component from highest limb. */
- fsub fp12,fp6,fp10
-
- beq L(even_integer)
-
- /* Our integer component is odd, so we are in the -PI/4 to 0 primary
- region. We need to shift our result down by PI/4, and to do this
- in the mod (4/PI) space we simply subtract 1. */
- lfd fp11,(L(DPone)-L(anchor))(r9)
- fsub fp12,fp12,fp11
-
- /* Now add up all the limbs in order. */
- fadd fp12,fp12,fp7
- fadd fp12,fp12,fp8
- fadd fp12,fp12,fp9
-
- /* And finally multiply by pi/4. */
- lfd fp13,(L(pio4)-L(anchor))(r9)
- fmul fp1,fp12,fp13
-
- addi r7,r7,1
- b L(reduced)
-
-L(even_integer):
- lfd fp11,(L(DPone)-L(anchor))(r9)
-
- /* Now add up all the limbs in order. */
- fadd fp12,fp12,fp7
- fadd fp12,fp12,fp8
- fadd fp12,fp12,fp9
-
- /* We need to check if the addition of all the limbs resulted in us
- overflowing 1.0. */
- fcmpu 0,fp12,fp11
- bgt L(greater_than_one)
-
- /* And finally multiply by pi/4. */
- lfd fp13,(L(pio4)-L(anchor))(r9)
- fmul fp1,fp12,fp13
-
- addi r7,r7,1
- b L(reduced)
-
-L(greater_than_one):
- /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
- integer, and subtract 1.0 from our result. Since that makes the
- integer component odd, we need to subtract another 1.0 as
- explained above. */
- addi r7,r7,1
-
- lfd fp11,(L(DPtwo)-L(anchor))(r9)
- fsub fp12,fp12,fp11
-
- /* And finally multiply by pi/4. */
- lfd fp13,(L(pio4)-L(anchor))(r9)
- fmul fp1,fp12,fp13
-
- addi r7,r7,1
- b L(reduced)
-
- .balign 16
-L(less_2pn5):
- lis r4,TWO_PN27@h
- ori r4,r4,TWO_PN27@l
-
- cmpw r3,r4
- blt L(less_2pn27)
-
- /* A simpler Chebyshev approximation is close enough for this range:
- 1.0+x^2*(CC0+x^3*CC1). */
-
- lfd fp10,(L(CC0)-L(anchor))(r9)
- lfd fp11,(L(CC1)-L(anchor))(r9)
-
- fmul fp2,fp1,fp1 /* x^2 */
- fmul fp3,fp2,fp1 /* x^3 */
- lfd fp1,(L(DPone)-L(anchor))(r9)
-
- fmadd fp4,fp3,fp11,fp10 /* CC0+x^3*CC1 */
- fmadd fp1,fp2,fp4,fp1 /* 1.0+x^2*(CC0+x^3*CC1) */
-
- frsp fp1,fp1 /* Round to single precision. */
-
- blr
-
- .balign 16
-L(less_2pn27):
- /* Handle some special cases:
-
- cosf(subnormal) raises inexact
- cosf(min_normalized) raises inexact
- cosf(normalized) raises inexact. */
-
- lfd fp2,(L(DPone)-L(anchor))(r9)
-
- fabs fp1,fp1 /* |x| */
- fsub fp1,fp2,fp1 /* 1.0-|x| */
-
- frsp fp1,fp1
-
- blr
-
-END (__cosf)
-
- .section .rodata, "a"
-
- .balign 8
-
-L(anchor):
-
- /* Chebyshev constants for sin, range -PI/4 - PI/4. */
-L(S0): .8byte 0xbfc5555555551cd9
-L(S1): .8byte 0x3f81111110c2688b
-L(S2): .8byte 0xbf2a019f8b4bd1f9
-L(S3): .8byte 0x3ec71d7264e6b5b4
-L(S4): .8byte 0xbe5a947e1674b58a
-
- /* Chebyshev constants for cos, range 2^-27 - 2^-5. */
-L(CC0): .8byte 0xbfdfffffff5cc6fd
-L(CC1): .8byte 0x3fa55514b178dac5
-
- /* Chebyshev constants for cos, range -PI/4 - PI/4. */
-L(C0): .8byte 0xbfdffffffffe98ae
-L(C1): .8byte 0x3fa55555545c50c7
-L(C2): .8byte 0xbf56c16b348b6874
-L(C3): .8byte 0x3efa00eb9ac43cc0
-L(C4): .8byte 0xbe923c97dd8844d7
-
-L(invpio2):
- .8byte 0x3fe45f306dc9c883 /* 2/PI */
-
-L(invpio4):
- .8byte 0x3ff45f306dc9c883 /* 4/PI */
-
-L(invpio4_table):
- .8byte 0x0000000000000000
- .8byte 0x3ff45f306c000000
- .8byte 0x3e3c9c882a000000
- .8byte 0x3c54fe13a8000000
- .8byte 0x3aaf47d4d0000000
- .8byte 0x38fbb81b6c000000
- .8byte 0x3714acc9e0000000
- .8byte 0x3560e4107c000000
- .8byte 0x33bca2c756000000
- .8byte 0x31fbd778ac000000
- .8byte 0x300b7246e0000000
- .8byte 0x2e5d2126e8000000
- .8byte 0x2c97003248000000
- .8byte 0x2ad77504e8000000
- .8byte 0x290921cfe0000000
- .8byte 0x274deb1cb0000000
- .8byte 0x25829a73e0000000
- .8byte 0x23fd1046be000000
- .8byte 0x2224baed10000000
- .8byte 0x20709d338e000000
- .8byte 0x1e535a2f80000000
- .8byte 0x1cef904e64000000
- .8byte 0x1b0d639830000000
- .8byte 0x1964ce7d24000000
- .8byte 0x17b908bf16000000
-
-L(pio4):
- .8byte 0x3fe921fb54442d18 /* PI/4 */
-
-/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
- to avoid losing significant bits when multiplying with up to
- (2^22)/(pi/2). */
-L(pio2hi):
- .8byte 0xbff921fb54400000
-
-L(pio2lo):
- .8byte 0xbdd0b4611a626332
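
[Editor's sketch] The reduction that consumes these two constants (the fmul/fadd/fmadd sequence in L(greater_or_equal_9pio4)) looks roughly like this in C; because the high part carries only 32 significant bits, k * pio2_hi stays exact for the k values used here, and the low part folds back the lost tail:

static const double pio2_hi = -0x1.921fb544p0;        /* L(pio2hi) */
static const double pio2_lo = -0x1.0b4611a626332p-34; /* L(pio2lo) */

/* ax = |x|, k = (n + 1) / 2; returns ax - k * (pi/2) in double.  */
static double reduce_pio2 (double ax, double k)
{
  double r = k * pio2_hi + ax;   /* product exact for k in range */
  return k * pio2_lo + r;        /* fold in the low correction   */
}
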
-
-L(pio2_table):
- .8byte 0
- .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */
- .8byte 0x400921fb54442d18 /* 2 * PI/2 */
- .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */
- .8byte 0x401921fb54442d18 /* 4 * PI/2 */
- .8byte 0x401f6a7a2955385e /* 5 * PI/2 */
- .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */
- .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */
- .8byte 0x402921fb54442d18 /* 8 * PI/2 */
- .8byte 0x402c463abeccb2bb /* 9 * PI/2 */
- .8byte 0x402f6a7a2955385e /* 10 * PI/2 */
-
-L(small):
- .8byte 0x3cd0000000000000 /* 2^-50 */
-
-L(ones):
- .8byte 0x3ff0000000000000 /* +1.0 */
- .8byte 0xbff0000000000000 /* -1.0 */
-
-L(DPhalf):
- .8byte 0x3fe0000000000000 /* 0.5 */
-
-L(DPone):
- .8byte 0x3ff0000000000000 /* 1.0 */
-
-L(DPtwo):
- .8byte 0x4000000000000000 /* 2.0 */
-
-weak_alias(__cosf, cosf)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
deleted file mode 100644
index fcdcb60293..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
+++ /dev/null
@@ -1,56 +0,0 @@
-/* isfinite(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <math_ldbl_opt.h>
-
-#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
-
-/* int [r3] __finite ([fp1] x) */
-
-EALIGN (__finite, 4, 0)
- CALL_MCOUNT 0
- MFVSRD_R3_V1
- lis r9,0x8010
- clrldi r3,r3,1 /* r3 = r3 & 0x7fffffffffffffff */
- rldicr r9,r9,32,31 /* r9 = r9 << 32 = 0x8010000000000000 */
- add r3,r3,r9
- rldicl r3,r3,1,63
- blr
-END (__finite)
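
[Editor's sketch] A C model of the branch-free test above: after the sign is cleared, adding 0x8010000000000000 leaves the top bit of the sum set exactly when the exponent field was not all ones, i.e. when x is finite; for Inf/NaN the sum wraps past 2^64 and the top bit comes out 0.

#include <stdint.h>
#include <string.h>

static int finite_model (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);      /* the MFVSRD move, in C */
  bits &= 0x7fffffffffffffffULL;        /* clrldi: drop sign bit */
  return (int) ((bits + 0x8010000000000000ULL) >> 63);
}
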
-
-hidden_def (__finite)
-weak_alias (__finite, finite)
-
-/* It turns out that the 'double' version will also always work for
- single-precision. */
-strong_alias (__finite, __finitef)
-hidden_def (__finitef)
-weak_alias (__finitef, finitef)
-
-#if IS_IN (libm)
-# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
-compat_symbol (libm, __finite, __finitel, GLIBC_2_0)
-compat_symbol (libm, finite, finitel, GLIBC_2_0)
-# endif
-#else
-# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0)
-compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
-compat_symbol (libc, finite, finitel, GLIBC_2_0);
-# endif
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S
deleted file mode 100644
index 54bd94176d..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finitef.S
+++ /dev/null
@@ -1 +0,0 @@
-/* This function uses the same code as s_finite.S. */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
deleted file mode 100644
index 32814e4525..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
+++ /dev/null
@@ -1,61 +0,0 @@
-/* isinf(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <math_ldbl_opt.h>
-
-#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
-
-/* int [r3] __isinf([fp1] x) */
-
-EALIGN (__isinf, 4, 0)
- CALL_MCOUNT 0
- MFVSRD_R3_V1
- lis r9,0x7ff0 /* r9 = 0x7ff0 */
- rldicl r10,r3,0,1 /* r10 = r3 & 0x7fffffffffffffff */
- sldi r9,r9,32 /* r9 = r9 << 32 */
- cmpd cr7,r10,r9 /* |x| == 0x7ff0000000000000 ? */
- beq cr7,L(inf)
- li r3,0 /* Not inf */
- blr
-L(inf):
- sradi r3,r3,63 /* r3 = r3 >> 63 */
- ori r3,r3,1 /* r3 = r3 | 0x1 */
- blr
-END (__isinf)
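
[Editor's sketch] The same test in C: infinity is the unique value whose non-sign bits equal 0x7ff0000000000000, and the sradi/ori pair turns the sign bit into the +-1 return value.

#include <stdint.h>
#include <string.h>

static int isinf_model (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  if ((bits & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL)
    return 0;
  /* Arithmetic shift of the sign, then set bit 0: -1 or +1.  */
  return (int) (((int64_t) bits >> 63) | 1);
}
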
-
-hidden_def (__isinf)
-weak_alias (__isinf, isinf)
-
-/* It turns out that the 'double' version will also always work for
- single-precision. */
-strong_alias (__isinf, __isinff)
-hidden_def (__isinff)
-weak_alias (__isinff, isinff)
-
-#ifdef NO_LONG_DOUBLE
-strong_alias (__isinf, __isinfl)
-weak_alias (__isinf, isinfl)
-#endif
-
-#if !IS_IN (libm)
-# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
-compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
-compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
-# endif
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S
deleted file mode 100644
index be759e091e..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinff.S
+++ /dev/null
@@ -1 +0,0 @@
-/* This function uses the same code as s_isinf.S. */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
deleted file mode 100644
index af52e502b7..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
+++ /dev/null
@@ -1,56 +0,0 @@
-/* isnan(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <math_ldbl_opt.h>
-
-#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
-
-/* int [r3] __isnan([f1] x) */
-
-EALIGN (__isnan, 4, 0)
- CALL_MCOUNT 0
- MFVSRD_R3_V1
- lis r9,0x7ff0
- clrldi r3,r3,1 /* r3 = r3 & 0x7fffffffffffffff */
- rldicr r9,r9,32,31 /* r9 = r9 << 32 = 0x7ff0000000000000 */
- subf r3,r3,r9
- rldicl r3,r3,1,63
- blr
-END (__isnan)
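
[Editor's sketch] A C model of the subtract/shift trick above: 0x7ff0000000000000 minus the non-sign bits wraps (goes "negative") exactly when those bits exceed the infinity pattern, which is the definition of a NaN, so the top bit of the difference is the answer.

#include <stdint.h>
#include <string.h>

static int isnan_model (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits &= 0x7fffffffffffffffULL;        /* clrldi: drop sign bit */
  return (int) ((0x7ff0000000000000ULL - bits) >> 63);
}
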
-
-hidden_def (__isnan)
-weak_alias (__isnan, isnan)
-
-/* It turns out that the 'double' version will also always work for
- single-precision. */
-strong_alias (__isnan, __isnanf)
-hidden_def (__isnanf)
-weak_alias (__isnanf, isnanf)
-
-#ifdef NO_LONG_DOUBLE
-strong_alias (__isnan, __isnanl)
-weak_alias (__isnan, isnanl)
-#endif
-
-#if !IS_IN (libm)
-# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
-compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
-compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
-# endif
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S
deleted file mode 100644
index b48c85e0d3..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnanf.S
+++ /dev/null
@@ -1 +0,0 @@
-/* This function uses the same code as s_isnan.S. */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
deleted file mode 100644
index aa180b6901..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Round double to long int. POWER8 PowerPC64 version.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <math_ldbl_opt.h>
-
-#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
-
-/* long long int[r3] __llrint (double x[fp1]) */
-ENTRY (__llrint)
- CALL_MCOUNT 0
- fctid fp1,fp1
- MFVSRD_R3_V1
- blr
-END (__llrint)
-
-strong_alias (__llrint, __lrint)
-weak_alias (__llrint, llrint)
-weak_alias (__lrint, lrint)
-
-#ifdef NO_LONG_DOUBLE
-strong_alias (__llrint, __llrintl)
-weak_alias (__llrint, llrintl)
-strong_alias (__lrint, __lrintl)
-weak_alias (__lrint, lrintl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llrint, llrintl, GLIBC_2_1)
-compat_symbol (libm, __lrint, lrintl, GLIBC_2_1)
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
deleted file mode 100644
index 043fc6a089..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* llround function. POWER8 PowerPC64 version.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <endian.h>
-#include <math_ldbl_opt.h>
-
-#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
-
-/* long long [r3] llround (double x [fp1]) */
-
-ENTRY (__llround)
- CALL_MCOUNT 0
- frin fp1,fp1 /* Round to nearest +-0.5. */
- fctidz fp1,fp1 /* Convert To Integer DW round toward 0. */
- MFVSRD_R3_V1
- blr
-END (__llround)
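
[Editor's note] Why two instructions suffice: frin rounds to the nearest integral value with ties away from zero, which matches C round(), and fctidz's truncation of an already-integral value is exact. A one-line C model:

#include <math.h>

static long long llround_model (double x)
{
  return (long long) round (x);   /* frin, then (exact) fctidz */
}
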
-
-strong_alias (__llround, __lround)
-weak_alias (__llround, llround)
-weak_alias (__lround, lround)
-
-#ifdef NO_LONG_DOUBLE
-weak_alias (__llround, llroundl)
-strong_alias (__llround, __llroundl)
-weak_alias (__lround, lroundl)
-strong_alias (__lround, __lroundl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llround, llroundl, GLIBC_2_1)
-compat_symbol (libm, __lround, lroundl, GLIBC_2_1)
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
deleted file mode 100644
index fb0add3462..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
+++ /dev/null
@@ -1,519 +0,0 @@
-/* Optimized sinf(). PowerPC64/POWER8 version.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#define _ERRNO_H 1
-#include <bits/errno.h>
-
-#define FRAMESIZE (FRAME_MIN_SIZE+16)
-
-#define FLOAT_EXPONENT_SHIFT 23
-#define FLOAT_EXPONENT_BIAS 127
-#define INTEGER_BITS 3
-
-#define PI_4 0x3f490fdb /* PI/4 */
-#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */
-#define TWO_PN5 0x3d000000 /* 2^-5 */
-#define TWO_PN27 0x32000000 /* 2^-27 */
-#define INFINITY 0x7f800000
-#define TWO_P23 0x4b000000 /* 2^23 */
-#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */
-
- /* Implements the function
-
- float [fp1] sinf (float [fp1] x) */
-
- .machine power8
-EALIGN(__sinf, 4, 0)
- addis r9,r2,L(anchor)@toc@ha
- addi r9,r9,L(anchor)@toc@l
-
- lis r4,PI_4@h
- ori r4,r4,PI_4@l
-
- xscvdpspn v0,v1
- mfvsrd r8,v0
- rldicl r3,r8,32,33 /* Remove sign bit. */
-
- cmpw r3,r4
- bge L(greater_or_equal_pio4)
-
- lis r4,TWO_PN5@h
- ori r4,r4,TWO_PN5@l
-
- cmpw r3,r4
- blt L(less_2pn5)
-
- /* Chebyshev polynomial of the form:
- * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
-
- lfd fp9,(L(S0)-L(anchor))(r9)
- lfd fp10,(L(S1)-L(anchor))(r9)
- lfd fp11,(L(S2)-L(anchor))(r9)
- lfd fp12,(L(S3)-L(anchor))(r9)
- lfd fp13,(L(S4)-L(anchor))(r9)
-
- fmul fp2,fp1,fp1 /* x^2 */
- fmul fp3,fp2,fp1 /* x^3 */
-
- fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
- fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
- fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
- fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
- fmadd fp1,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
- frsp fp1,fp1 /* Round to single precision. */
-
- blr
-
- .balign 16
-L(greater_or_equal_pio4):
- lis r4,NINEPI_4@h
- ori r4,r4,NINEPI_4@l
- cmpw r3,r4
- bge L(greater_or_equal_9pio4)
-
- /* Calculate quotient of |x|/(PI/4). */
- lfd fp2,(L(invpio4)-L(anchor))(r9)
- fabs fp1,fp1 /* |x| */
- fmul fp2,fp1,fp2 /* |x|/(PI/4) */
- fctiduz fp2,fp2
- mfvsrd r3,v2 /* n = |x| / (PI/4), truncated */
-
- /* Now use that quotient to find |x| mod (PI/2). */
- addi r7,r3,1
- rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */
- addi r6,r9,(L(pio2_table)-L(anchor))
- lfdx fp4,r5,r6
- fsub fp1,fp1,fp4
-
- .balign 16
-L(reduced):
- /* Now we are in the range -PI/4 to PI/4. */
-
- /* Work out if we are in a positive or negative primary interval. */
- rldicl r4,r7,62,63 /* ((n+1) >> 2) & 1 */
-
- /* We are operating on |x|, so we need to add back the original
- sign. */
- rldicl r8,r8,33,63 /* (x >> 31) & 1, ie the sign bit. */
- xor r4,r4,r8 /* 0 if result should be positive,
- 1 if negative. */
-
- /* Load a 1.0 or -1.0. */
- addi r5,r9,(L(ones)-L(anchor))
- sldi r4,r4,3
- lfdx fp0,r4,r5
-
- /* Are we in the primary interval of sin or cos? */
- andi. r4,r7,0x2
- bne L(cos)
-
- /* Chebyshev polynomial of the form:
- x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
-
- lfd fp9,(L(S0)-L(anchor))(r9)
- lfd fp10,(L(S1)-L(anchor))(r9)
- lfd fp11,(L(S2)-L(anchor))(r9)
- lfd fp12,(L(S3)-L(anchor))(r9)
- lfd fp13,(L(S4)-L(anchor))(r9)
-
- fmul fp2,fp1,fp1 /* x^2 */
- fmul fp3,fp2,fp1 /* x^3 */
-
- fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
- fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
- fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
- fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
- fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
- fmul fp4,fp4,fp0 /* Add in the sign. */
- frsp fp1,fp4 /* Round to single precision. */
-
- blr
-
- .balign 16
-L(cos):
- /* Chebyshev polynomial of the form:
- 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
-
- lfd fp9,(L(C0)-L(anchor))(r9)
- lfd fp10,(L(C1)-L(anchor))(r9)
- lfd fp11,(L(C2)-L(anchor))(r9)
- lfd fp12,(L(C3)-L(anchor))(r9)
- lfd fp13,(L(C4)-L(anchor))(r9)
-
- fmul fp2,fp1,fp1 /* x^2 */
- lfd fp3,(L(DPone)-L(anchor))(r9)
-
- fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
- fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
- fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
- fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
- fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
- fmul fp4,fp4,fp0 /* Add in the sign. */
- frsp fp1,fp4 /* Round to single precision. */
-
- blr
-
- .balign 16
-L(greater_or_equal_9pio4):
- lis r4,INFINITY@h
- ori r4,r4,INFINITY@l
- cmpw r3,r4
- bge L(inf_or_nan)
-
- lis r4,TWO_P23@h
- ori r4,r4,TWO_P23@l
- cmpw r3,r4
- bge L(greater_or_equal_2p23)
-
- fabs fp1,fp1 /* |x| */
-
- /* Calculate quotient of |x|/(PI/4). */
- lfd fp2,(L(invpio4)-L(anchor))(r9)
-
- lfd fp3,(L(DPone)-L(anchor))(r9)
- lfd fp4,(L(DPhalf)-L(anchor))(r9)
- fmul fp2,fp1,fp2 /* |x|/(PI/4) */
- friz fp2,fp2 /* n = floor(|x|/(PI/4)) */
-
- /* Calculate (n + 1) / 2. */
- fadd fp2,fp2,fp3 /* n + 1 */
- fmul fp3,fp2,fp4 /* (n + 1) / 2 */
- friz fp3,fp3
-
- lfd fp4,(L(pio2hi)-L(anchor))(r9)
- lfd fp5,(L(pio2lo)-L(anchor))(r9)
-
- fmul fp6,fp4,fp3
- fadd fp6,fp6,fp1
- fmadd fp1,fp5,fp3,fp6
-
- fctiduz fp2,fp2
- mfvsrd r7,v2 /* n + 1 */
-
- b L(reduced)
-
- .balign 16
-L(inf_or_nan):
- bne L(skip_errno_setting) /* Is it a NaN? */
-
- /* We delayed the creation of the stack frame, as well as the saving of
- the link register, because only at this point, we are sure that
- doing so is actually needed. */
-
- stfd fp1,-8(r1)
-
- /* Save the link register. */
- mflr r0
- std r0,16(r1)
- cfi_offset(lr, 16)
-
- /* Create the stack frame. */
- stdu r1,-FRAMESIZE(r1)
- cfi_adjust_cfa_offset(FRAMESIZE)
-
- bl JUMPTARGET(__errno_location)
- nop
-
- /* Restore the stack frame. */
- addi r1,r1,FRAMESIZE
- cfi_adjust_cfa_offset(-FRAMESIZE)
- /* Restore the link register. */
- ld r0,16(r1)
- mtlr r0
-
- lfd fp1,-8(r1)
-
- /* errno = EDOM */
- li r4,EDOM
- stw r4,0(r3)
-
-L(skip_errno_setting):
- fsub fp1,fp1,fp1 /* x - x */
- blr
-
- .balign 16
-L(greater_or_equal_2p23):
- fabs fp1,fp1
-
- srwi r4,r3,FLOAT_EXPONENT_SHIFT
- subi r4,r4,FLOAT_EXPONENT_BIAS
-
- /* We reduce the input modulo pi/4, so we need 3 bits of integer
- to determine where in 2*pi we are. Index into our array
- accordingly. */
- addi r4,r4,INTEGER_BITS
-
- /* To avoid an expensive divide, for the range we care about (0 - 127)
- we can transform x/28 into:
-
- x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
-
- mulhwu returns the top 32 bits of the 64 bit result, doing the
- shift for us in the same instruction. The top 32 bits are undefined,
- so we have to mask them. */
-
- lis r6,FX_FRACTION_1_28@h
- ori r6,r6,FX_FRACTION_1_28@l
- mulhwu r5,r4,r6
- clrldi r5,r5,32
-
- /* Get our pointer into the invpio4_table array. */
- sldi r4,r5,3
- addi r6,r9,(L(invpio4_table)-L(anchor))
- add r4,r4,r6
-
- lfd fp2,0(r4)
- lfd fp3,8(r4)
- lfd fp4,16(r4)
- lfd fp5,24(r4)
-
- fmul fp6,fp2,fp1
- fmul fp7,fp3,fp1
- fmul fp8,fp4,fp1
- fmul fp9,fp5,fp1
-
- /* Mask off larger integer bits in highest double word that we don't
- care about to avoid losing precision when combining with smaller
- values. */
- fctiduz fp10,fp6
- mfvsrd r7,v10
- rldicr r7,r7,0,(63-INTEGER_BITS)
- mtvsrd v10,r7
- fcfidu fp10,fp10 /* Integer bits. */
-
- fsub fp6,fp6,fp10 /* highest -= integer bits */
-
- /* Work out the integer component, rounded down. Use the top two
- limbs for this. */
- fadd fp10,fp6,fp7 /* highest + higher */
-
- fctiduz fp10,fp10
- mfvsrd r7,v10
- andi. r0,r7,1
- fcfidu fp10,fp10
-
- /* Subtract integer component from highest limb. */
- fsub fp12,fp6,fp10
-
- beq L(even_integer)
-
- /* Our integer component is odd, so we are in the -PI/4 to 0 primary
- region. We need to shift our result down by PI/4, and to do this
- in the mod (4/PI) space we simply subtract 1. */
- lfd fp11,(L(DPone)-L(anchor))(r9)
- fsub fp12,fp12,fp11
-
- /* Now add up all the limbs in order. */
- fadd fp12,fp12,fp7
- fadd fp12,fp12,fp8
- fadd fp12,fp12,fp9
-
- /* And finally multiply by pi/4. */
- lfd fp13,(L(pio4)-L(anchor))(r9)
- fmul fp1,fp12,fp13
-
- addi r7,r7,1
- b L(reduced)
-
-L(even_integer):
- lfd fp11,(L(DPone)-L(anchor))(r9)
-
- /* Now add up all the limbs in order. */
- fadd fp12,fp12,fp7
- fadd fp12,fp12,fp8
- fadd fp12,fp12,fp9
-
- /* We need to check if the addition of all the limbs resulted in us
- overflowing 1.0. */
- fcmpu 0,fp12,fp11
- bgt L(greater_than_one)
-
- /* And finally multiply by pi/4. */
- lfd fp13,(L(pio4)-L(anchor))(r9)
- fmul fp1,fp12,fp13
-
- addi r7,r7,1
- b L(reduced)
-
-L(greater_than_one):
- /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
- integer, and subtract 1.0 from our result. Since that makes the
- integer component odd, we need to subtract another 1.0 as
- explained above. */
- addi r7,r7,1
-
- lfd fp11,(L(DPtwo)-L(anchor))(r9)
- fsub fp12,fp12,fp11
-
- /* And finally multiply by pi/4. */
- lfd fp13,(L(pio4)-L(anchor))(r9)
- fmul fp1,fp12,fp13
-
- addi r7,r7,1
- b L(reduced)
-
- .balign 16
-L(less_2pn5):
- lis r4,TWO_PN27@h
- ori r4,r4,TWO_PN27@l
-
- cmpw r3,r4
- blt L(less_2pn27)
-
- /* A simpler Chebyshev approximation is close enough for this range:
- x+x^3*(SS0+x^2*SS1). */
-
- lfd fp10,(L(SS0)-L(anchor))(r9)
- lfd fp11,(L(SS1)-L(anchor))(r9)
-
- fmul fp2,fp1,fp1 /* x^2 */
- fmul fp3,fp2,fp1 /* x^3 */
-
- fmadd fp4,fp2,fp11,fp10 /* SS0+x^2*SS1 */
- fmadd fp1,fp3,fp4,fp1 /* x+x^3*(SS0+x^2*SS1) */
-
- frsp fp1,fp1 /* Round to single precision. */
-
- blr
-
- .balign 16
-L(less_2pn27):
- cmpwi r3,0
- beq L(zero)
-
- /* Handle some special cases:
-
- sinf(subnormal) raises inexact/underflow
- sinf(min_normalized) raises inexact/underflow
- sinf(normalized) raises inexact. */
-
- lfd fp2,(L(small)-L(anchor))(r9)
-
- fmul fp2,fp1,fp2 /* x * small */
- fsub fp1,fp1,fp2 /* x - x * small */
-
- frsp fp1,fp1
-
- blr
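
[Editor's sketch] For these tiny arguments sinf(x) rounds to x itself, but the IEEE flags still have to be raised; the multiply-and-subtract above achieves that without a branch. A hedged C equivalent:

/* 0 < |x| < 2^-27: the result is x, but computing it this way raises
   inexact (and underflow when x is subnormal), as required.  */
static float sinf_tiny (float x)
{
  double xd = x;
  return (float) (xd - xd * 0x1p-50);   /* L(small) is 2^-50 */
}
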
-
- .balign 16
-L(zero):
- blr
-
-END (__sinf)
-
- .section .rodata, "a"
-
- .balign 8
-
-L(anchor):
-
- /* Chebyshev constants for sin, range -PI/4 - PI/4. */
-L(S0): .8byte 0xbfc5555555551cd9
-L(S1): .8byte 0x3f81111110c2688b
-L(S2): .8byte 0xbf2a019f8b4bd1f9
-L(S3): .8byte 0x3ec71d7264e6b5b4
-L(S4): .8byte 0xbe5a947e1674b58a
-
- /* Chebyshev constants for sin, range 2^-27 - 2^-5. */
-L(SS0): .8byte 0xbfc555555543d49d
-L(SS1): .8byte 0x3f8110f475cec8c5
-
- /* Chebyshev constants for cos, range -PI/4 - PI/4. */
-L(C0): .8byte 0xbfdffffffffe98ae
-L(C1): .8byte 0x3fa55555545c50c7
-L(C2): .8byte 0xbf56c16b348b6874
-L(C3): .8byte 0x3efa00eb9ac43cc0
-L(C4): .8byte 0xbe923c97dd8844d7
-
-L(invpio2):
- .8byte 0x3fe45f306dc9c883 /* 2/PI */
-
-L(invpio4):
- .8byte 0x3ff45f306dc9c883 /* 4/PI */
-
-L(invpio4_table):
- .8byte 0x0000000000000000
- .8byte 0x3ff45f306c000000
- .8byte 0x3e3c9c882a000000
- .8byte 0x3c54fe13a8000000
- .8byte 0x3aaf47d4d0000000
- .8byte 0x38fbb81b6c000000
- .8byte 0x3714acc9e0000000
- .8byte 0x3560e4107c000000
- .8byte 0x33bca2c756000000
- .8byte 0x31fbd778ac000000
- .8byte 0x300b7246e0000000
- .8byte 0x2e5d2126e8000000
- .8byte 0x2c97003248000000
- .8byte 0x2ad77504e8000000
- .8byte 0x290921cfe0000000
- .8byte 0x274deb1cb0000000
- .8byte 0x25829a73e0000000
- .8byte 0x23fd1046be000000
- .8byte 0x2224baed10000000
- .8byte 0x20709d338e000000
- .8byte 0x1e535a2f80000000
- .8byte 0x1cef904e64000000
- .8byte 0x1b0d639830000000
- .8byte 0x1964ce7d24000000
- .8byte 0x17b908bf16000000
-
-L(pio4):
- .8byte 0x3fe921fb54442d18 /* PI/4 */
-
-/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
- to avoid losing significant bits when multiplying with up to
- (2^22)/(pi/2). */
-L(pio2hi):
- .8byte 0xbff921fb54400000
-
-L(pio2lo):
- .8byte 0xbdd0b4611a626332
-
-L(pio2_table):
- .8byte 0
- .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */
- .8byte 0x400921fb54442d18 /* 2 * PI/2 */
- .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */
- .8byte 0x401921fb54442d18 /* 4 * PI/2 */
- .8byte 0x401f6a7a2955385e /* 5 * PI/2 */
- .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */
- .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */
- .8byte 0x402921fb54442d18 /* 8 * PI/2 */
- .8byte 0x402c463abeccb2bb /* 9 * PI/2 */
- .8byte 0x402f6a7a2955385e /* 10 * PI/2 */
-
-L(small):
- .8byte 0x3cd0000000000000 /* 2^-50 */
-
-L(ones):
- .8byte 0x3ff0000000000000 /* +1.0 */
- .8byte 0xbff0000000000000 /* -1.0 */
-
-L(DPhalf):
- .8byte 0x3fe0000000000000 /* 0.5 */
-
-L(DPone):
- .8byte 0x3ff0000000000000 /* 1.0 */
-
-L(DPtwo):
- .8byte 0x4000000000000000 /* 2.0 */
-
-weak_alias(__sinf, sinf)
diff --git a/sysdeps/powerpc/powerpc64/power8/memcmp.S b/sysdeps/powerpc/powerpc64/power8/memcmp.S
deleted file mode 100644
index 46b9c0067a..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/memcmp.S
+++ /dev/null
@@ -1,1447 +0,0 @@
-/* Optimized memcmp implementation for POWER7/PowerPC64.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* int [r3] memcmp (const char *s1 [r3],
- const char *s2 [r4],
- size_t size [r5]) */
-
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#ifndef MEMCMP
-# define MEMCMP memcmp
-#endif
- .machine power7
-EALIGN (MEMCMP, 4, 0)
- CALL_MCOUNT 3
-
-#define rRTN r3
-#define rSTR1 r3 /* First string arg. */
-#define rSTR2 r4 /* Second string arg. */
-#define rN r5 /* Max string length. */
-#define rWORD1 r6 /* Current word in s1. */
-#define rWORD2 r7 /* Current word in s2. */
-#define rWORD3 r8 /* Next word in s1. */
-#define rWORD4 r9 /* Next word in s2. */
-#define rWORD5 r10 /* Next word in s1. */
-#define rWORD6 r11 /* Next word in s2. */
-
-#define rOFF8 r20 /* 8 bytes offset. */
-#define rOFF16 r21 /* 16 bytes offset. */
-#define rOFF24 r22 /* 24 bytes offset. */
-#define rOFF32 r23 /* 32 bytes offset. */
-#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
-#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
-#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
-#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
-#define rSHR r28 /* Unaligned shift right count. */
-#define rSHL r29 /* Unaligned shift left count. */
-#define rWORD7 r30 /* Next word in s1. */
-#define rWORD8 r31 /* Next word in s2. */
-
-#define rWORD8SAVE (-8)
-#define rWORD7SAVE (-16)
-#define rOFF8SAVE (-24)
-#define rOFF16SAVE (-32)
-#define rOFF24SAVE (-40)
-#define rOFF32SAVE (-48)
-#define rSHRSAVE (-56)
-#define rSHLSAVE (-64)
-#define rWORD8SHIFTSAVE (-72)
-#define rWORD2SHIFTSAVE (-80)
-#define rWORD4SHIFTSAVE (-88)
-#define rWORD6SHIFTSAVE (-96)
-
-#ifdef __LITTLE_ENDIAN__
-# define LD ldbrx
-#else
-# define LD ldx
-#endif
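
[Editor's sketch] The point of byte-reversing loads (ldbrx) on little-endian: once each 8-byte chunk sits in the register in memory (big-endian) order, a single unsigned doubleword compare decides the memcmp result for the whole chunk. A portable C model of one chunk comparison:

#include <stdint.h>

static int cmp_chunk8 (const unsigned char *a, const unsigned char *b)
{
  uint64_t wa = 0, wb = 0;
  for (int i = 0; i < 8; i++)       /* assemble in memory order */
    {
      wa = (wa << 8) | a[i];
      wb = (wb << 8) | b[i];
    }
  return wa < wb ? -1 : wa > wb;    /* one unsigned compare */
}
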
-
- xor r10, rSTR2, rSTR1
- cmpldi cr6, rN, 0
- cmpldi cr1, rN, 8
- clrldi. r0, r10, 61
- clrldi r12, rSTR1, 61
- cmpldi cr5, r12, 0
- beq- cr6, L(zeroLength)
- dcbt 0, rSTR1
- dcbt 0, rSTR2
- /* If less than 8 bytes or not aligned, use the unaligned
- byte loop. */
- blt cr1, L(bytealigned)
- bne L(unalignedqw)
-/* At this point we know both strings have the same alignment and the
- compare length is at least 8 bytes. r12 contains the low order
- 3 bits of rSTR1 and cr5 contains the result of the logical compare
- of r12 to 0. If r12 == 0 then we are already double word
- aligned and can perform the DW aligned loop. */
-
- .align 4
-L(samealignment):
- or r11, rSTR2, rSTR1
- clrldi. r11, r11, 60
- beq L(qw_align)
- /* Try to align to QW else proceed to DW loop. */
- clrldi. r10, r10, 60
- bne L(DW)
- /* For the difference to reach QW alignment, load as DW. */
- clrrdi rSTR1, rSTR1, 3
- clrrdi rSTR2, rSTR2, 3
- subfic r10, r12, 8
- LD rWORD1, 0, rSTR1
- LD rWORD2, 0, rSTR2
- sldi r9, r10, 3
- subfic r9, r9, 64
- sld rWORD1, rWORD1, r9
- sld rWORD2, rWORD2, r9
- cmpld cr6, rWORD1, rWORD2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- bne cr6, L(ret_diff)
- subf rN, r10, rN
-
- cmpld cr6, r11, r12
- bgt cr6, L(qw_align)
- LD rWORD1, 0, rSTR1
- LD rWORD2, 0, rSTR2
- cmpld cr6, rWORD1, rWORD2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- bne cr6, L(different)
- cmpldi cr6, rN, 8
- ble cr6, L(zeroLength)
- addi rN, rN, -8
- /* Now both rSTR1 and rSTR2 are aligned to QW. */
- .align 4
-L(qw_align):
- vspltisb v0, 0
- srdi. r6, rN, 6
- li r8, 16
- li r10, 32
- li r11, 48
- ble cr0, L(lessthan64)
- mtctr r6
- vspltisb v8, 0
- vspltisb v6, 0
- /* Aligned vector loop. */
- .align 4
-L(aligned_loop):
- lvx v4, 0, rSTR1
- lvx v5, 0, rSTR2
- vcmpequb. v7, v6, v8
- bnl cr6, L(different3)
- lvx v6, rSTR1, r8
- lvx v8, rSTR2, r8
- vcmpequb. v7, v5, v4
- bnl cr6, L(different2)
- lvx v4, rSTR1, r10
- lvx v5, rSTR2, r10
- vcmpequb. v7, v6, v8
- bnl cr6, L(different3)
- lvx v6, rSTR1, r11
- lvx v8, rSTR2, r11
- vcmpequb. v7, v5, v4
- bnl cr6, L(different2)
- addi rSTR1, rSTR1, 64
- addi rSTR2, rSTR2, 64
- bdnz L(aligned_loop)
- vcmpequb. v7, v6, v8
- bnl cr6, L(different3)
- clrldi rN, rN, 58
- /* Handle remainder for aligned loop. */
- .align 4
-L(lessthan64):
- mr r9, rSTR1
- cmpdi cr6, rN, 0
- li rSTR1, 0
- blelr cr6
- lvx v4, 0, r9
- lvx v5, 0, rSTR2
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- addi rN, rN, -16
-
- cmpdi cr6, rN, 0
- blelr cr6
- lvx v4, r9, r8
- lvx v5, rSTR2, r8
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- addi rN, rN, -16
-
- cmpdi cr6, rN, 0
- blelr cr6
- lvx v4, r9, r10
- lvx v5, rSTR2, r10
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- addi rN, rN, -16
-
- cmpdi cr6, rN, 0
- blelr cr6
- lvx v4, r9, r11
- lvx v5, rSTR2, r11
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- blr
-
- /* Calculate and return the difference. */
- .align 4
-L(different1):
- cmpdi cr6, rN, 16
- bge cr6, L(different2)
- /* Discard unwanted bytes. */
-#ifdef __LITTLE_ENDIAN__
- lvsr v1, 0, rN
- vperm v4, v4, v0, v1
- vperm v5, v5, v0, v1
-#else
- lvsl v1, 0, rN
- vperm v4, v0, v4, v1
- vperm v5, v0, v5, v1
-#endif
- vcmpequb. v7, v4, v5
- li rRTN, 0
- bltlr cr6
- .align 4
-L(different2):
-#ifdef __LITTLE_ENDIAN__
- /* Reverse bytes for direct comparison. */
- lvsl v10, r0, r0
- vspltisb v8, 15
- vsububm v9, v8, v10
- vperm v4, v4, v0, v9
- vperm v5, v5, v0, v9
-#endif
- MFVRD(r7, v4)
- MFVRD(r9, v5)
- cmpld cr6, r7, r9
- bne cr6, L(ret_diff)
- /* Difference in second DW. */
- vsldoi v4, v4, v4, 8
- vsldoi v5, v5, v5, 8
- MFVRD(r7, v4)
- MFVRD(r9, v5)
- cmpld cr6, r7, r9
-L(ret_diff):
- li rRTN, 1
- bgtlr cr6
- li rRTN, -1
- blr
- .align 4
-L(different3):
-#ifdef __LITTLE_ENDIAN__
- /* Reverse bytes for direct comparison. */
- vspltisb v9, 15
- lvsl v10, r0, r0
- vsububm v9, v9, v10
- vperm v6, v6, v0, v9
- vperm v8, v8, v0, v9
-#endif
- MFVRD(r7, v6)
- MFVRD(r9, v8)
- cmpld cr6, r7, r9
- bne cr6, L(ret_diff)
- /* Difference in second DW. */
- vsldoi v6, v6, v6, 8
- vsldoi v8, v8, v8, 8
- MFVRD(r7, v6)
- MFVRD(r9, v8)
- cmpld cr6, r7, r9
- li rRTN, 1
- bgtlr cr6
- li rRTN, -1
- blr
-
- .align 4
-L(different):
- cmpldi cr7, rN, 8
- bgt cr7, L(end)
- /* Skip unwanted bytes. */
- sldi r8, rN, 3
- subfic r8, r8, 64
- srd rWORD1, rWORD1, r8
- srd rWORD2, rWORD2, r8
- cmpld cr6, rWORD1, rWORD2
- li rRTN, 0
- beqlr cr6
-L(end):
- li rRTN, 1
- bgtlr cr6
- li rRTN, -1
- blr
-
- .align 4
-L(unalignedqw):
-	/* Proceed to the DW unaligned loop if there is a chance of a page cross. */
- rldicl r9, rSTR1, 0, 52
- add r9, r9, rN
- cmpldi cr0, r9, 4096-16
- bgt cr0, L(unaligned)
- rldicl r9, rSTR2, 0, 52
- add r9, r9, rN
- cmpldi cr0, r9, 4096-16
- bgt cr0, L(unaligned)
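
The two checks just above guard the vector path: if the last 16-byte vector load
relative to either string could run past a 4 KiB page boundary, control falls
back to the doubleword path at L(unaligned). A minimal C sketch of the same
guard (the page size and 16-byte load width are taken from the code above):

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch: return nonzero when reading n bytes starting at p with
       16-byte vector loads could touch the page after the last byte.  */
    static int
    may_cross_page (const void *p, size_t n)
    {
      uintptr_t off = (uintptr_t) p & 4095;  /* rldicl r9, rSTRx, 0, 52 */
      return off + n > 4096 - 16;            /* cmpldi r9, 4096-16; bgt  */
    }
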
- li r0, 0
- li r8, 16
- vspltisb v0, 0
- /* Check if rSTR1 is aligned to QW. */
- andi. r11, rSTR1, 0xF
- beq L(s1_align)
-
- /* Compare 16B and align S1 to QW. */
-#ifdef __LITTLE_ENDIAN__
- lvsr v10, 0, rSTR1 /* Compute mask. */
- lvsr v6, 0, rSTR2 /* Compute mask. */
-#else
- lvsl v10, 0, rSTR1 /* Compute mask. */
- lvsl v6, 0, rSTR2 /* Compute mask. */
-#endif
- lvx v5, 0, rSTR2
- lvx v9, rSTR2, r8
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v9, v5, v6
-#else
- vperm v5, v5, v9, v6
-#endif
- lvx v4, 0, rSTR1
- lvx v9, rSTR1, r8
-#ifdef __LITTLE_ENDIAN__
- vperm v4, v9, v4, v10
-#else
- vperm v4, v4, v9, v10
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- cmpldi cr6, rN, 16
- ble cr6, L(zeroLength)
- subfic r11, r11, 16
- subf rN, r11, rN
- add rSTR1, rSTR1, r11
- add rSTR2, rSTR2, r11
-
-	/* As s1 is QW aligned, prepare for the unaligned loop.  */
- .align 4
-L(s1_align):
-#ifdef __LITTLE_ENDIAN__
- lvsr v6, 0, rSTR2
-#else
- lvsl v6, 0, rSTR2
-#endif
- lvx v5, 0, rSTR2
- srdi. r6, rN, 6
- li r10, 32
- li r11, 48
- ble cr0, L(lessthan64_unalign)
- mtctr r6
- li r9, 64
- /* Unaligned vector loop. */
- .align 4
-L(unalign_qwloop):
- lvx v4, 0, rSTR1
- lvx v10, rSTR2, r8
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v10, v5, v6
-#else
- vperm v5, v5, v10, v6
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different2)
- vor v5, v10, v10
- lvx v4, rSTR1, r8
- lvx v10, rSTR2, r10
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v10, v5, v6
-#else
- vperm v5, v5, v10, v6
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different2)
- vor v5, v10, v10
- lvx v4, rSTR1, r10
- lvx v10, rSTR2, r11
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v10, v5, v6
-#else
- vperm v5, v5, v10, v6
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different2)
- vor v5, v10, v10
- lvx v4, rSTR1, r11
- lvx v10, rSTR2, r9
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v10, v5, v6
-#else
- vperm v5, v5, v10, v6
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different2)
- vor v5, v10, v10
- addi rSTR1, rSTR1, 64
- addi rSTR2, rSTR2, 64
- bdnz L(unalign_qwloop)
- clrldi rN, rN, 58
- /* Handle remainder for unaligned loop. */
- .align 4
-L(lessthan64_unalign):
- mr r9, rSTR1
- cmpdi cr6, rN, 0
- li rSTR1, 0
- blelr cr6
- lvx v4, 0, r9
- lvx v10, rSTR2, r8
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v10, v5, v6
-#else
- vperm v5, v5, v10, v6
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- vor v5, v10, v10
- addi rN, rN, -16
-
- cmpdi cr6, rN, 0
- blelr cr6
- lvx v4, r9, r8
- lvx v10, rSTR2, r10
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v10, v5, v6
-#else
- vperm v5, v5, v10, v6
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- vor v5, v10, v10
- addi rN, rN, -16
-
- cmpdi cr6, rN, 0
- blelr cr6
- lvx v4, r9, r10
- lvx v10, rSTR2, r11
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v10, v5, v6
-#else
- vperm v5, v5, v10, v6
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- vor v5, v10, v10
- addi rN, rN, -16
-
- cmpdi cr6, rN, 0
- blelr cr6
- lvx v4, r9, r11
- addi r11, r11, 16
- lvx v10, rSTR2, r11
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v10, v5, v6
-#else
- vperm v5, v5, v10, v6
-#endif
- vcmpequb. v7, v5, v4
- bnl cr6, L(different1)
- blr
-
-/* Otherwise we know the two strings have the same alignment (but not
- yet DW). So we force the string addresses to the next lower DW
- boundary and special case this first DW using shift left to
- eliminate bits preceding the first byte. Since we want to join the
- normal (DW aligned) compare loop, starting at the second double word,
- we need to adjust the length (rN) and special case the loop
- versioning for the first DW. This ensures that the loop count is
- correct and the first DW (shifted) is in the expected register pair. */
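
The comment above is the heart of the same-alignment path: round both pointers
down to a DW boundary, then shift the first doubleword pair left by 8 * r12
bits so that any bytes preceding the true start of the strings fall off the top
and cannot influence the compare. A small C model of that first shifted compare
(assuming, as the LD macro arranges, that each loaded word holds the string
bytes most-significant-byte first):

    #include <stdint.h>

    /* Sketch: w1/w2 are the doublewords at the rounded-down addresses and
       r12 (1-7) is the shared misalignment.  After the shift only bytes
       belonging to the strings remain, so inequality means a real
       difference.  */
    static int
    first_dw_differs (uint64_t w1, uint64_t w2, unsigned r12)
    {
      return (w1 << (8 * r12)) != (w2 << (8 * r12));
    }
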
- .align 4
-L(DW):
- std rWORD8, rWORD8SAVE(r1)
- std rWORD7, rWORD7SAVE(r1)
- std rOFF8, rOFF8SAVE(r1)
- std rOFF16, rOFF16SAVE(r1)
- std rOFF24, rOFF24SAVE(r1)
- std rOFF32, rOFF32SAVE(r1)
- cfi_offset(rWORD8, rWORD8SAVE)
- cfi_offset(rWORD7, rWORD7SAVE)
- cfi_offset(rOFF8, rOFF8SAVE)
- cfi_offset(rOFF16, rOFF16SAVE)
- cfi_offset(rOFF24, rOFF24SAVE)
- cfi_offset(rOFF32, rOFF32SAVE)
-
- li rOFF8,8
- li rOFF16,16
- li rOFF24,24
- li rOFF32,32
- clrrdi rSTR1, rSTR1, 3
- clrrdi rSTR2, rSTR2, 3
- beq cr5, L(DWaligned)
- add rN, rN, r12
- sldi rWORD6, r12, 3
- srdi r0, rN, 5 /* Divide by 32. */
- andi. r12, rN, 24 /* Get the DW remainder. */
- LD rWORD1, 0, rSTR1
- LD rWORD2, 0, rSTR2
- cmpldi cr1, r12, 16
- cmpldi cr7, rN, 32
- clrldi rN, rN, 61
- beq L(dPs4)
- mtctr r0
- bgt cr1, L(dPs3)
- beq cr1, L(dPs2)
-
-/* Remainder is 8. */
- .align 3
-L(dsP1):
- sld rWORD5, rWORD1, rWORD6
- sld rWORD6, rWORD2, rWORD6
- cmpld cr5, rWORD5, rWORD6
- blt cr7, L(dP1x)
-/* Do something useful in this cycle since we have to branch anyway. */
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr7, rWORD1, rWORD2
- b L(dP1e)
-/* Remainder is 16. */
- .align 4
-L(dPs2):
- sld rWORD5, rWORD1, rWORD6
- sld rWORD6, rWORD2, rWORD6
- cmpld cr6, rWORD5, rWORD6
- blt cr7, L(dP2x)
-/* Do something useful in this cycle since we have to branch anyway. */
- LD rWORD7, rOFF8, rSTR1
- LD rWORD8, rOFF8, rSTR2
- cmpld cr5, rWORD7, rWORD8
- b L(dP2e)
-/* Remainder is 24. */
- .align 4
-L(dPs3):
- sld rWORD3, rWORD1, rWORD6
- sld rWORD4, rWORD2, rWORD6
- cmpld cr1, rWORD3, rWORD4
- b L(dP3e)
-/* Count is a multiple of 32, remainder is 0. */
- .align 4
-L(dPs4):
- mtctr r0
- sld rWORD1, rWORD1, rWORD6
- sld rWORD2, rWORD2, rWORD6
- cmpld cr7, rWORD1, rWORD2
- b L(dP4e)
-
-/* At this point we know both strings are double word aligned and the
- compare length is at least 8 bytes. */
- .align 4
-L(DWaligned):
- andi. r12, rN, 24 /* Get the DW remainder. */
- srdi r0, rN, 5 /* Divide by 32. */
- cmpldi cr1, r12, 16
- cmpldi cr7, rN, 32
- clrldi rN, rN, 61
- beq L(dP4)
- bgt cr1, L(dP3)
- beq cr1, L(dP2)
-
-/* Remainder is 8. */
- .align 4
-L(dP1):
- mtctr r0
-/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
- (8-15 byte compare), we want to use only volatile registers. This
- means we can avoid restoring non-volatile registers since we did not
- change any on the early exit path. The key here is the non-early
- exit path only cares about the condition code (cr5), not about which
- register pair was used. */
- LD rWORD5, 0, rSTR1
- LD rWORD6, 0, rSTR2
- cmpld cr5, rWORD5, rWORD6
- blt cr7, L(dP1x)
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr7, rWORD1, rWORD2
-L(dP1e):
- LD rWORD3, rOFF16, rSTR1
- LD rWORD4, rOFF16, rSTR2
- cmpld cr1, rWORD3, rWORD4
- LD rWORD5, rOFF24, rSTR1
- LD rWORD6, rOFF24, rSTR2
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(dLcr5x)
- bne cr7, L(dLcr7x)
-
- LD rWORD7, rOFF32, rSTR1
- LD rWORD8, rOFF32, rSTR2
- addi rSTR1, rSTR1, 32
- addi rSTR2, rSTR2, 32
- bne cr1, L(dLcr1)
- cmpld cr5, rWORD7, rWORD8
- bdnz L(dLoop)
- bne cr6, L(dLcr6)
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- .align 3
-L(dP1x):
- sldi. r12, rN, 3
- bne cr5, L(dLcr5x)
- subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- bne L(d00)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
-/* Remainder is 16. */
- .align 4
-L(dP2):
- mtctr r0
- LD rWORD5, 0, rSTR1
- LD rWORD6, 0, rSTR2
- cmpld cr6, rWORD5, rWORD6
- blt cr7, L(dP2x)
- LD rWORD7, rOFF8, rSTR1
- LD rWORD8, rOFF8, rSTR2
- cmpld cr5, rWORD7, rWORD8
-L(dP2e):
- LD rWORD1, rOFF16, rSTR1
- LD rWORD2, rOFF16, rSTR2
- cmpld cr7, rWORD1, rWORD2
- LD rWORD3, rOFF24, rSTR1
- LD rWORD4, rOFF24, rSTR2
- cmpld cr1, rWORD3, rWORD4
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- bne cr6, L(dLcr6)
- bne cr5, L(dLcr5)
- b L(dLoop2)
- .align 4
-L(dP2x):
- LD rWORD3, rOFF8, rSTR1
- LD rWORD4, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- sldi. r12, rN, 3
- bne cr6, L(dLcr6x)
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- bne cr1, L(dLcr1x)
- subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- bne L(d00)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
-/* Remainder is 24. */
- .align 4
-L(dP3):
- mtctr r0
- LD rWORD3, 0, rSTR1
- LD rWORD4, 0, rSTR2
- cmpld cr1, rWORD3, rWORD4
-L(dP3e):
- LD rWORD5, rOFF8, rSTR1
- LD rWORD6, rOFF8, rSTR2
- cmpld cr6, rWORD5, rWORD6
- blt cr7, L(dP3x)
- LD rWORD7, rOFF16, rSTR1
- LD rWORD8, rOFF16, rSTR2
- cmpld cr5, rWORD7, rWORD8
- LD rWORD1, rOFF24, rSTR1
- LD rWORD2, rOFF24, rSTR2
- cmpld cr7, rWORD1, rWORD2
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
- bne cr1, L(dLcr1)
- bne cr6, L(dLcr6)
- b L(dLoop1)
-/* Again we are on an early exit path (24-31 byte compare); we want to
-   use only volatile registers and avoid restoring non-volatile
-   registers. */
- .align 4
-L(dP3x):
- LD rWORD1, rOFF16, rSTR1
- LD rWORD2, rOFF16, rSTR2
- cmpld cr7, rWORD1, rWORD2
- sldi. r12, rN, 3
- bne cr1, L(dLcr1x)
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
- bne cr6, L(dLcr6x)
- subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- bne cr7, L(dLcr7x)
- bne L(d00)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
-/* Count is a multiple of 32, remainder is 0. */
- .align 4
-L(dP4):
- mtctr r0
- LD rWORD1, 0, rSTR1
- LD rWORD2, 0, rSTR2
- cmpld cr7, rWORD1, rWORD2
-L(dP4e):
- LD rWORD3, rOFF8, rSTR1
- LD rWORD4, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- LD rWORD5, rOFF16, rSTR1
- LD rWORD6, rOFF16, rSTR2
- cmpld cr6, rWORD5, rWORD6
- LD rWORD7, rOFF24, rSTR1
- LD rWORD8, rOFF24, rSTR2
- addi rSTR1, rSTR1, 24
- addi rSTR2, rSTR2, 24
- cmpld cr5, rWORD7, rWORD8
- bne cr7, L(dLcr7)
- bne cr1, L(dLcr1)
- bdz- L(d24) /* Adjust CTR as we start with +4. */
-/* This is the primary loop. */
- .align 4
-L(dLoop):
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- bne cr6, L(dLcr6)
-L(dLoop1):
- LD rWORD3, rOFF16, rSTR1
- LD rWORD4, rOFF16, rSTR2
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(dLcr5)
-L(dLoop2):
- LD rWORD5, rOFF24, rSTR1
- LD rWORD6, rOFF24, rSTR2
- cmpld cr5, rWORD7, rWORD8
- bne cr7, L(dLcr7)
-L(dLoop3):
- LD rWORD7, rOFF32, rSTR1
- LD rWORD8, rOFF32, rSTR2
- addi rSTR1, rSTR1, 32
- addi rSTR2, rSTR2, 32
- bne cr1, L(dLcr1)
- cmpld cr7, rWORD1, rWORD2
- bdnz L(dLoop)
-
-L(dL4):
- cmpld cr1, rWORD3, rWORD4
- bne cr6, L(dLcr6)
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(dLcr5)
- cmpld cr5, rWORD7, rWORD8
-L(d44):
- bne cr7, L(dLcr7)
-L(d34):
- bne cr1, L(dLcr1)
-L(d24):
- bne cr6, L(dLcr6)
-L(d14):
- sldi. r12, rN, 3
- bne cr5, L(dLcr5)
-L(d04):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- beq L(duzeroLength)
-/* At this point we have a remainder of 1 to 7 bytes to compare. Since
- we are aligned it is safe to load the whole double word, and use
- shift right double to eliminate bits beyond the compare length. */
-L(d00):
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- srd rWORD1, rWORD1, rN
- srd rWORD2, rWORD2, rN
- cmpld cr7, rWORD1, rWORD2
- bne cr7, L(dLcr7x)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
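
The L(d00) sequence above is the aligned tail: with 1 to 7 bytes left it loads
a full doubleword from each string (safe, because both are DW aligned) and
shifts right to discard the bytes beyond the compare length. A C restatement
under the same most-significant-byte-first word layout:

    #include <stdint.h>

    /* Sketch of L(d00): rem is the 1-7 remaining bytes; the shift count
       64 - 8*rem is what 'subfic rN, r12, 64' computes.  */
    static int
    aligned_tail_equal (uint64_t w1, uint64_t w2, unsigned rem)
    {
      unsigned sh = 64 - 8 * rem;
      return (w1 >> sh) == (w2 >> sh);
    }
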
- .align 4
-L(dLcr7):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dLcr7x):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 1
- bgtlr cr7
- li rRTN, -1
- blr
- .align 4
-L(dLcr1):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dLcr1x):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 1
- bgtlr cr1
- li rRTN, -1
- blr
- .align 4
-L(dLcr6):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dLcr6x):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 1
- bgtlr cr6
- li rRTN, -1
- blr
- .align 4
-L(dLcr5):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dLcr5x):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 1
- bgtlr cr5
- li rRTN, -1
- blr
-
- .align 4
-L(bytealigned):
- mtctr rN
-
-/* We need to prime this loop. This loop is swing modulo scheduled
-   to avoid pipe delays. The dependent instruction latency (load to
-   compare to conditional branch) is 2 to 3 cycles. In this loop each
-   dispatch group ends in a branch and takes 1 cycle. Effectively
-   the first iteration of the loop only serves to load operands, and
-   branches based on compares are delayed until the next iteration.
-
-   So we must precondition some registers and condition codes so that
-   we don't exit the loop early on the first iteration. */
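
Functionally the loop below reduces to a plain one-byte-at-a-time compare; the
three rotating condition registers (cr7/cr1/cr6) and the bdz side exits only
reorder the work so loads overlap the 2-3 cycle load/compare/branch chain. A C
statement of the semantics being scheduled:

    #include <stddef.h>

    /* What L(bLoop) computes, stripped of the software pipelining.  */
    static int
    byte_compare (const unsigned char *s1, const unsigned char *s2,
                  size_t n)
    {
      while (n-- > 0)
        {
          int d = *s1++ - *s2++;
          if (d != 0)
            return d;
        }
      return 0;
    }
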
-
- lbz rWORD1, 0(rSTR1)
- lbz rWORD2, 0(rSTR2)
- bdz L(b11)
- cmpld cr7, rWORD1, rWORD2
- lbz rWORD3, 1(rSTR1)
- lbz rWORD4, 1(rSTR2)
- bdz L(b12)
- cmpld cr1, rWORD3, rWORD4
- lbzu rWORD5, 2(rSTR1)
- lbzu rWORD6, 2(rSTR2)
- bdz L(b13)
- .align 4
-L(bLoop):
- lbzu rWORD1, 1(rSTR1)
- lbzu rWORD2, 1(rSTR2)
- bne cr7, L(bLcr7)
-
- cmpld cr6, rWORD5, rWORD6
- bdz L(b3i)
-
- lbzu rWORD3, 1(rSTR1)
- lbzu rWORD4, 1(rSTR2)
- bne cr1, L(bLcr1)
-
- cmpld cr7, rWORD1, rWORD2
- bdz L(b2i)
-
- lbzu rWORD5, 1(rSTR1)
- lbzu rWORD6, 1(rSTR2)
- bne cr6, L(bLcr6)
-
- cmpld cr1, rWORD3, rWORD4
- bdnz L(bLoop)
-
-/* We speculatively load bytes before we have tested the previous
-   bytes. But we must avoid overrunning the length (in the ctr) to
-   prevent these speculative loads from causing a segfault. In that
-   case the loop will exit early (before all pending bytes are
-   tested), and we must complete the pending operations before
-   returning. */
-L(b1i):
- bne cr7, L(bLcr7)
- bne cr1, L(bLcr1)
- b L(bx56)
- .align 4
-L(b2i):
- bne cr6, L(bLcr6)
- bne cr7, L(bLcr7)
- b L(bx34)
- .align 4
-L(b3i):
- bne cr1, L(bLcr1)
- bne cr6, L(bLcr6)
- b L(bx12)
- .align 4
-L(bLcr7):
- li rRTN, 1
- bgtlr cr7
- li rRTN, -1
- blr
-L(bLcr1):
- li rRTN, 1
- bgtlr cr1
- li rRTN, -1
- blr
-L(bLcr6):
- li rRTN, 1
- bgtlr cr6
- li rRTN, -1
- blr
-
-L(b13):
- bne cr7, L(bx12)
- bne cr1, L(bx34)
-L(bx56):
- sub rRTN, rWORD5, rWORD6
- blr
- nop
-L(b12):
- bne cr7, L(bx12)
-L(bx34):
- sub rRTN, rWORD3, rWORD4
- blr
-L(b11):
-L(bx12):
- sub rRTN, rWORD1, rWORD2
- blr
-
- .align 4
-L(zeroLength):
- li rRTN, 0
- blr
-
- .align 4
-/* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes. r12 contains the low order
-   3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of r12 to 0. If r12 == 0 then rSTR1 is double word
-   aligned and we can perform the DWunaligned loop.
-
-   Otherwise we know that rSTR1 is not yet DW aligned.
-   So we can force the string addresses to the next lower DW
-   boundary and special case this first DW using shift left to
-   eliminate bits preceding the first byte. Since we want to join the
-   normal (DWaligned) compare loop, starting at the second double word,
-   we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This ensures that the loop count is
-   correct and the first DW (shifted) is in the expected register pair. */
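
Throughout the unaligned path below, each logical doubleword of rSTR2 is
rebuilt from two aligned loads with a shift pair: the tail of the previous word
is shifted left by rSHL and OR-ed with the next word shifted right by rSHR. A C
sketch of that recombination (rSHL is nonzero on this path, so neither shift is
by 64):

    #include <stdint.h>

    /* Sketch of the srd/sld/or triples: shl = 8 * misalignment,
       shr = 64 - shl, words most-significant-byte first.  */
    static uint64_t
    combine_dw (uint64_t prev, uint64_t next, unsigned shl, unsigned shr)
    {
      return (prev << shl) | (next >> shr);
    }
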
-L(unaligned):
- std rWORD8, rWORD8SAVE(r1)
- std rWORD7, rWORD7SAVE(r1)
- std rOFF8, rOFF8SAVE(r1)
- std rOFF16, rOFF16SAVE(r1)
- std rOFF24, rOFF24SAVE(r1)
- std rOFF32, rOFF32SAVE(r1)
- cfi_offset(rWORD8, rWORD8SAVE)
- cfi_offset(rWORD7, rWORD7SAVE)
- cfi_offset(rOFF8, rOFF8SAVE)
- cfi_offset(rOFF16, rOFF16SAVE)
- cfi_offset(rOFF24, rOFF24SAVE)
- cfi_offset(rOFF32, rOFF32SAVE)
- li rOFF8,8
- li rOFF16,16
- li rOFF24,24
- li rOFF32,32
- std rSHL, rSHLSAVE(r1)
- cfi_offset(rSHL, rSHLSAVE)
- clrldi rSHL, rSTR2, 61
- beq cr6, L(duzeroLength)
- std rSHR, rSHRSAVE(r1)
- cfi_offset(rSHR, rSHRSAVE)
- beq cr5, L(DWunaligned)
- std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
- cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
-/* Adjust the logical start of rSTR2 to compensate for the extra bits
- in the 1st rSTR1 DW. */
- sub rWORD8_SHIFT, rSTR2, r12
-/* But do not attempt to address the DW before the DW that contains
-   the actual start of rSTR2. */
- clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
-/* Compute the left/right shift counts for the unaligned rSTR2,
- compensating for the logical (DW aligned) start of rSTR1. */
- clrldi rSHL, rWORD8_SHIFT, 61
- clrrdi rSTR1, rSTR1, 3
- std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
- sldi rSHL, rSHL, 3
- cmpld cr5, rWORD8_SHIFT, rSTR2
- add rN, rN, r12
- sldi rWORD6, r12, 3
- std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
- cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
- cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
- cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
- subfic rSHR, rSHL, 64
- srdi r0, rN, 5 /* Divide by 32. */
- andi. r12, rN, 24 /* Get the DW remainder. */
-/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
-   this special case those bits may be discarded anyway. Also we
-   must avoid loading a DW where none of the bits are part of rSTR2,
-   as this may cross a page boundary and cause a page fault. */
- li rWORD8, 0
- blt cr5, L(dus0)
- LD rWORD8, 0, rSTR2
- addi rSTR2, rSTR2, 8
- sld rWORD8, rWORD8, rSHL
-
-L(dus0):
- LD rWORD1, 0, rSTR1
- LD rWORD2, 0, rSTR2
- cmpldi cr1, r12, 16
- cmpldi cr7, rN, 32
- srd r12, rWORD2, rSHR
- clrldi rN, rN, 61
- beq L(duPs4)
- mtctr r0
- or rWORD8, r12, rWORD8
- bgt cr1, L(duPs3)
- beq cr1, L(duPs2)
-
-/* Remainder is 8. */
- .align 4
-L(dusP1):
- sld rWORD8_SHIFT, rWORD2, rSHL
- sld rWORD7, rWORD1, rWORD6
- sld rWORD8, rWORD8, rWORD6
- bge cr7, L(duP1e)
-/* At this point we exit early with the first double word compare
- complete and remainder of 0 to 7 bytes. See L(du14) for details on
- how we handle the remaining bytes. */
- cmpld cr5, rWORD7, rWORD8
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- b L(dutrim)
-/* Remainder is 16. */
- .align 4
-L(duPs2):
- sld rWORD6_SHIFT, rWORD2, rSHL
- sld rWORD5, rWORD1, rWORD6
- sld rWORD6, rWORD8, rWORD6
- b L(duP2e)
-/* Remainder is 24. */
- .align 4
-L(duPs3):
- sld rWORD4_SHIFT, rWORD2, rSHL
- sld rWORD3, rWORD1, rWORD6
- sld rWORD4, rWORD8, rWORD6
- b L(duP3e)
-/* Count is a multiple of 32, remainder is 0. */
- .align 4
-L(duPs4):
- mtctr r0
- or rWORD8, r12, rWORD8
- sld rWORD2_SHIFT, rWORD2, rSHL
- sld rWORD1, rWORD1, rWORD6
- sld rWORD2, rWORD8, rWORD6
- b L(duP4e)
-
-/* At this point we know rSTR1 is double word aligned and the
- compare length is at least 8 bytes. */
- .align 4
-L(DWunaligned):
- std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
- clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
- srdi r0, rN, 5 /* Divide by 32. */
- std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
- andi. r12, rN, 24 /* Get the DW remainder. */
- std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
- cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
- cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
- cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
- cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
- sldi rSHL, rSHL, 3
- LD rWORD6, 0, rSTR2
- LD rWORD8, rOFF8, rSTR2
- addi rSTR2, rSTR2, 8
- cmpldi cr1, r12, 16
- cmpldi cr7, rN, 32
- clrldi rN, rN, 61
- subfic rSHR, rSHL, 64
- sld rWORD6_SHIFT, rWORD6, rSHL
- beq L(duP4)
- mtctr r0
- bgt cr1, L(duP3)
- beq cr1, L(duP2)
-
-/* Remainder is 8. */
- .align 4
-L(duP1):
- srd r12, rWORD8, rSHR
- LD rWORD7, 0, rSTR1
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- blt cr7, L(duP1x)
-L(duP1e):
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr5, rWORD7, rWORD8
- srd r0, rWORD2, rSHR
- sld rWORD2_SHIFT, rWORD2, rSHL
- or rWORD2, r0, rWORD8_SHIFT
- LD rWORD3, rOFF16, rSTR1
- LD rWORD4, rOFF16, rSTR2
- cmpld cr7, rWORD1, rWORD2
- srd r12, rWORD4, rSHR
- sld rWORD4_SHIFT, rWORD4, rSHL
- bne cr5, L(duLcr5)
- or rWORD4, r12, rWORD2_SHIFT
- LD rWORD5, rOFF24, rSTR1
- LD rWORD6, rOFF24, rSTR2
- cmpld cr1, rWORD3, rWORD4
- srd r0, rWORD6, rSHR
- sld rWORD6_SHIFT, rWORD6, rSHL
- bne cr7, L(duLcr7)
- or rWORD6, r0, rWORD4_SHIFT
- cmpld cr6, rWORD5, rWORD6
- b L(duLoop3)
- .align 4
-/* At this point we exit early with the first double word compare
- complete and remainder of 0 to 7 bytes. See L(du14) for details on
- how we handle the remaining bytes. */
-L(duP1x):
- cmpld cr5, rWORD7, rWORD8
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- b L(dutrim)
-/* Remainder is 16. */
- .align 4
-L(duP2):
- srd r0, rWORD8, rSHR
- LD rWORD5, 0, rSTR1
- or rWORD6, r0, rWORD6_SHIFT
- sld rWORD6_SHIFT, rWORD8, rSHL
-L(duP2e):
- LD rWORD7, rOFF8, rSTR1
- LD rWORD8, rOFF8, rSTR2
- cmpld cr6, rWORD5, rWORD6
- srd r12, rWORD8, rSHR
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- blt cr7, L(duP2x)
- LD rWORD1, rOFF16, rSTR1
- LD rWORD2, rOFF16, rSTR2
- cmpld cr5, rWORD7, rWORD8
- bne cr6, L(duLcr6)
- srd r0, rWORD2, rSHR
- sld rWORD2_SHIFT, rWORD2, rSHL
- or rWORD2, r0, rWORD8_SHIFT
- LD rWORD3, rOFF24, rSTR1
- LD rWORD4, rOFF24, rSTR2
- cmpld cr7, rWORD1, rWORD2
- bne cr5, L(duLcr5)
- srd r12, rWORD4, rSHR
- sld rWORD4_SHIFT, rWORD4, rSHL
- or rWORD4, r12, rWORD2_SHIFT
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- cmpld cr1, rWORD3, rWORD4
- b L(duLoop2)
- .align 4
-L(duP2x):
- cmpld cr5, rWORD7, rWORD8
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- bne cr6, L(duLcr6)
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- b L(dutrim)
-
-/* Remainder is 24. */
- .align 4
-L(duP3):
- srd r12, rWORD8, rSHR
- LD rWORD3, 0, rSTR1
- sld rWORD4_SHIFT, rWORD8, rSHL
- or rWORD4, r12, rWORD6_SHIFT
-L(duP3e):
- LD rWORD5, rOFF8, rSTR1
- LD rWORD6, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- srd r0, rWORD6, rSHR
- sld rWORD6_SHIFT, rWORD6, rSHL
- or rWORD6, r0, rWORD4_SHIFT
- LD rWORD7, rOFF16, rSTR1
- LD rWORD8, rOFF16, rSTR2
- cmpld cr6, rWORD5, rWORD6
- bne cr1, L(duLcr1)
- srd r12, rWORD8, rSHR
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- blt cr7, L(duP3x)
- LD rWORD1, rOFF24, rSTR1
- LD rWORD2, rOFF24, rSTR2
- cmpld cr5, rWORD7, rWORD8
- bne cr6, L(duLcr6)
- srd r0, rWORD2, rSHR
- sld rWORD2_SHIFT, rWORD2, rSHL
- or rWORD2, r0, rWORD8_SHIFT
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
- cmpld cr7, rWORD1, rWORD2
- b L(duLoop1)
- .align 4
-L(duP3x):
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
- cmpld cr5, rWORD7, rWORD8
- bne cr6, L(duLcr6)
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- b L(dutrim)
-
-/* Count is a multiple of 32, remainder is 0. */
- .align 4
-L(duP4):
- mtctr r0
- srd r0, rWORD8, rSHR
- LD rWORD1, 0, rSTR1
- sld rWORD2_SHIFT, rWORD8, rSHL
- or rWORD2, r0, rWORD6_SHIFT
-L(duP4e):
- LD rWORD3, rOFF8, rSTR1
- LD rWORD4, rOFF8, rSTR2
- cmpld cr7, rWORD1, rWORD2
- srd r12, rWORD4, rSHR
- sld rWORD4_SHIFT, rWORD4, rSHL
- or rWORD4, r12, rWORD2_SHIFT
- LD rWORD5, rOFF16, rSTR1
- LD rWORD6, rOFF16, rSTR2
- cmpld cr1, rWORD3, rWORD4
- bne cr7, L(duLcr7)
- srd r0, rWORD6, rSHR
- sld rWORD6_SHIFT, rWORD6, rSHL
- or rWORD6, r0, rWORD4_SHIFT
- LD rWORD7, rOFF24, rSTR1
- LD rWORD8, rOFF24, rSTR2
- addi rSTR1, rSTR1, 24
- addi rSTR2, rSTR2, 24
- cmpld cr6, rWORD5, rWORD6
- bne cr1, L(duLcr1)
- srd r12, rWORD8, rSHR
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- cmpld cr5, rWORD7, rWORD8
- bdz L(du24) /* Adjust CTR as we start with +4. */
-/* This is the primary loop. */
- .align 4
-L(duLoop):
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- bne cr6, L(duLcr6)
- srd r0, rWORD2, rSHR
- sld rWORD2_SHIFT, rWORD2, rSHL
- or rWORD2, r0, rWORD8_SHIFT
-L(duLoop1):
- LD rWORD3, rOFF16, rSTR1
- LD rWORD4, rOFF16, rSTR2
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(duLcr5)
- srd r12, rWORD4, rSHR
- sld rWORD4_SHIFT, rWORD4, rSHL
- or rWORD4, r12, rWORD2_SHIFT
-L(duLoop2):
- LD rWORD5, rOFF24, rSTR1
- LD rWORD6, rOFF24, rSTR2
- cmpld cr5, rWORD7, rWORD8
- bne cr7, L(duLcr7)
- srd r0, rWORD6, rSHR
- sld rWORD6_SHIFT, rWORD6, rSHL
- or rWORD6, r0, rWORD4_SHIFT
-L(duLoop3):
- LD rWORD7, rOFF32, rSTR1
- LD rWORD8, rOFF32, rSTR2
- addi rSTR1, rSTR1, 32
- addi rSTR2, rSTR2, 32
- cmpld cr7, rWORD1, rWORD2
- bne cr1, L(duLcr1)
- srd r12, rWORD8, rSHR
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- bdnz L(duLoop)
-
-L(duL4):
- cmpld cr1, rWORD3, rWORD4
- bne cr6, L(duLcr6)
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(duLcr5)
- cmpld cr5, rWORD7, rWORD8
-L(du44):
- bne cr7, L(duLcr7)
-L(du34):
- bne cr1, L(duLcr1)
-L(du24):
- bne cr6, L(duLcr6)
-L(du14):
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
-/* At this point we have a remainder of 1 to 7 bytes to compare. We use
-   shift right double to eliminate bits beyond the compare length.
-
-   However it may not be safe to load rWORD2, which may be beyond the
-   string length. So we compare the bit length of the remainder to
-   the right shift count (rSHR). If the bit count is less than or
-   equal, we do not need to load rWORD2 (all significant bits are
-   already in rWORD8_SHIFT). */
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
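
The guard above is the subtle part of the unaligned tail: the remaining bits
may already be entirely contained in rWORD8_SHIFT, in which case the extra load
of rSTR2's next doubleword (which could fault past the end of the buffer) is
skipped and r0 stays zero. A C sketch of L(dutrim) with that guard folded in:

    #include <stdint.h>

    /* Sketch: rem_bits = 8 * remaining bytes, shr = rSHR.  next_dw is
       dereferenced only when some needed bits live in the next
       doubleword, mirroring the 'ble cr7, L(dutrim)' shortcut.  */
    static int
    unaligned_tail_equal (uint64_t w1, uint64_t prev_shifted,
                          const uint64_t *next_dw,
                          unsigned rem_bits, unsigned shr)
    {
      uint64_t r0 = rem_bits > shr ? *next_dw >> shr : 0;
      uint64_t w2 = r0 | prev_shifted;
      unsigned sh = 64 - rem_bits;
      return (w1 >> sh) == (w2 >> sh);
    }
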
- .align 4
-L(dutrim):
- LD rWORD1, rOFF8, rSTR1
- ld rWORD8, -8(r1)
- subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
- or rWORD2, r0, rWORD8_SHIFT
- ld rWORD7, rWORD7SAVE(r1)
- ld rSHL, rSHLSAVE(r1)
- srd rWORD1, rWORD1, rN
- srd rWORD2, rWORD2, rN
- ld rSHR, rSHRSAVE(r1)
- ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
- li rRTN, 0
- cmpld cr7, rWORD1, rWORD2
- ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
- ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
- beq cr7, L(dureturn24)
- li rRTN, 1
- ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- bgtlr cr7
- li rRTN, -1
- blr
- .align 4
-L(duLcr7):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- li rRTN, 1
- bgt cr7, L(dureturn29)
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
- li rRTN, -1
- b L(dureturn27)
- .align 4
-L(duLcr1):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- li rRTN, 1
- bgt cr1, L(dureturn29)
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
- li rRTN, -1
- b L(dureturn27)
- .align 4
-L(duLcr6):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- li rRTN, 1
- bgt cr6, L(dureturn29)
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
- li rRTN, -1
- b L(dureturn27)
- .align 4
-L(duLcr5):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- li rRTN, 1
- bgt cr5, L(dureturn29)
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
- li rRTN, -1
- b L(dureturn27)
-
- .align 3
-L(duZeroReturn):
- li rRTN, 0
- .align 4
-L(dureturn):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dureturn29):
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
-L(dureturn27):
- ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
- ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
- ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
-L(dureturn24):
- ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- blr
-
-L(duzeroLength):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
-END (MEMCMP)
-libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp, bcmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
deleted file mode 100644
index bc734c9f4f..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/memset.S
+++ /dev/null
@@ -1,458 +0,0 @@
-/* Optimized memset implementation for PowerPC64/POWER8.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
-
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
- Returns 's'. */
-
-#ifndef MEMSET
-# define MEMSET memset
-#endif
-
-	/* No need to use .machine power8 since mtvsrd is already
-	   handled by the define. This avoids breakage on binutils
-	   versions that do not support this machine specifier. */
- .machine power7
-EALIGN (MEMSET, 5, 0)
- CALL_MCOUNT 3
-
-L(_memset):
- cmpldi cr7,r5,31
- neg r0,r3
- mr r10,r3
-
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32 /* Replicate byte to word. */
- ble cr7,L(write_LT_32)
-
- andi. r11,r10,15 /* Check alignment of DST. */
- insrdi r4,r4,32,0 /* Replicate word to double word. */
-
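
The two insrdi at the top of the function plus the one above replicate the fill
byte across a doubleword so that a single std stores eight copies. The same
replication in C:

    #include <stdint.h>

    /* Sketch of the insrdi chain: byte -> halfword -> word -> DW.  */
    static uint64_t
    replicate_byte (uint64_t c)
    {
      c &= 0xff;
      c |= c << 8;    /* insrdi r4,r4,8,48  */
      c |= c << 16;   /* insrdi r4,r4,16,32 */
      c |= c << 32;   /* insrdi r4,r4,32,0  */
      return c;
    }
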
- beq L(big_aligned)
-
- mtocrf 0x01,r0
- clrldi r0,r0,60
-
- /* Get DST aligned to 16 bytes. */
-1: bf 31,2f
- stb r4,0(r10)
- addi r10,r10,1
-
-2: bf 30,4f
- sth r4,0(r10)
- addi r10,r10,2
-
-4: bf 29,8f
- stw r4,0(r10)
- addi r10,r10,4
-
-8: bf 28,16f
- std r4,0(r10)
- addi r10,r10,8
-
-16: subf r5,r0,r5
-
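
The bf-driven sequence above peels off 1-, 2-, 4- and 8-byte stores according
to the low bits of r0 = -dst mod 16, so that r10 ends up 16-byte aligned;
mtocrf simply makes those bits testable as condition-register flags. An
equivalent C head loop (unaligned scalar stores are fine on POWER; portable
code would use memcpy instead of the pointer casts):

    #include <stdint.h>

    /* Sketch: store up to 15 head bytes so dst becomes 16B aligned.  */
    static unsigned char *
    align_dst_16 (unsigned char *dst, uint64_t word)
    {
      uintptr_t r0 = -(uintptr_t) dst & 15;
      if (r0 & 1) { *dst = (unsigned char) word; dst += 1; }
      if (r0 & 2) { *(uint16_t *) dst = (uint16_t) word; dst += 2; }
      if (r0 & 4) { *(uint32_t *) dst = (uint32_t) word; dst += 4; }
      if (r0 & 8) { *(uint64_t *) dst = word; dst += 8; }
      return dst;
    }
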
- .align 4
-L(big_aligned):
-	/* For sizes larger than 255 bytes there are two possible paths:
-	   - if the constant is '0', zero full cache lines with dcbz;
-	   - otherwise use vector instructions. */
- cmpldi cr5,r5,255
- dcbtst 0,r10
- cmpldi cr6,r4,0
- crand 27,26,21
- bt 27,L(huge_dcbz)
- bge cr5,L(huge_vector)
-
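
The crand above fuses two comparisons into one condition bit: CR bit 27 becomes
(n > 255) AND (c == 0), selecting the dcbz path with a single branch. The
dispatch restated in C:

    #include <stddef.h>

    /* Sketch of the dispatch above for n > 31.  */
    enum path { DCBZ_PATH, VECTOR_PATH, DOUBLEWORD_PATH };

    static enum path
    memset_dispatch (int c, size_t n)
    {
      if (c == 0 && n > 255)
        return DCBZ_PATH;        /* bt 27, L(huge_dcbz)       */
      if (n >= 255)
        return VECTOR_PATH;      /* bge cr5, L(huge_vector)   */
      return DOUBLEWORD_PATH;    /* 32..255: unrolled std     */
    }
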
-
-	/* Size between 32 and 255 bytes with a constant different from 0:
-	   use doubleword store instructions to achieve best throughput. */
- srdi r8,r5,5
- clrldi r11,r5,59
- cmpldi cr6,r11,0
- cmpdi r8,0
- beq L(tail_bytes)
- mtctr r8
-
- /* Main aligned write loop, writes 32-bytes at a time. */
- .align 4
-L(big_loop):
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- addi r10,r10,32
- bdz L(tail_bytes)
-
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
-	addi r10,r10,32
- bdnz L(big_loop)
-
- b L(tail_bytes)
-
- /* Write remaining 1~31 bytes. */
- .align 4
-L(tail_bytes):
- beqlr cr6
-
- srdi r7,r11,4
- clrldi r8,r11,60
- mtocrf 0x01,r7
-
- .align 4
- bf 31,8f
- std r4,0(r10)
- std r4,8(r10)
- addi r10,r10,16
-
- .align 4
-8: mtocrf 0x1,r8
- bf 28,4f
- std r4,0(r10)
- addi r10,r10,8
-
- .align 4
-4: bf 29,2f
-	stw r4,0(r10)
-	addi r10,r10,4
-
- .align 4
-2: bf 30,1f
-	sth r4,0(r10)
-	addi r10,r10,2
-
- .align 4
-1: bflr 31
-	stb r4,0(r10)
- blr
-
-	/* Size larger than 255 bytes with a constant different from 0: use
-	   vector instructions to achieve best throughput. */
-L(huge_vector):
- /* Replicate set byte to quadword in VMX register. */
- MTVSRD_V1_R4
- xxpermdi 32,v0,v1,0
- vspltb v2,v0,15
-
- /* Main aligned write loop: 128 bytes at a time. */
- li r6,16
- li r7,32
- li r8,48
- mtocrf 0x02,r5
- srdi r12,r5,7
- cmpdi r12,0
- beq L(aligned_tail)
- mtctr r12
- b L(aligned_128loop)
-
- .align 4
-L(aligned_128loop):
- stvx v2,0,r10
- stvx v2,r10,r6
- stvx v2,r10,r7
- stvx v2,r10,r8
- addi r10,r10,64
- stvx v2,0,r10
- stvx v2,r10,r6
- stvx v2,r10,r7
- stvx v2,r10,r8
- addi r10,r10,64
- bdnz L(aligned_128loop)
-
- /* Write remaining 1~127 bytes. */
-L(aligned_tail):
- mtocrf 0x01,r5
- bf 25,32f
- stvx v2,0,r10
- stvx v2,r10,r6
- stvx v2,r10,r7
- stvx v2,r10,r8
- addi r10,r10,64
-
-32: bf 26,16f
- stvx v2,0,r10
- stvx v2,r10,r6
- addi r10,r10,32
-
-16: bf 27,8f
- stvx v2,0,r10
- addi r10,r10,16
-
-8: bf 28,4f
- std r4,0(r10)
- addi r10,r10,8
-
- /* Copies 4~7 bytes. */
-4: bf 29,L(tail2)
- stw r4,0(r10)
- bf 30,L(tail5)
- sth r4,4(r10)
- bflr 31
- stb r4,6(r10)
- /* Return original DST pointer. */
- blr
-
- /* Special case when value is 0 and we have a long length to deal
- with. Use dcbz to zero out a full cacheline of 128 bytes at a time.
- Before using dcbz though, we need to get the destination 128-byte
- aligned. */
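
On POWER8 dcbz zeroes one 128-byte cache line, so the code first uses ordinary
stores to walk the destination up to a 128-byte boundary and only then enters
the dcbz loop. A C model, with a hypothetical dcbz_line() standing in for the
instruction:

    #include <stdint.h>
    #include <string.h>

    extern void dcbz_line (void *p);  /* hypothetical dcbz wrapper */

    /* Sketch of L(huge_dcbz), assuming the POWER8 128-byte line.  */
    static void
    zero_huge (unsigned char *dst, size_t n)
    {
      size_t head = -(uintptr_t) dst & 127;
      memset (dst, 0, head);           /* the std sequences above */
      dst += head;
      n -= head;
      while (n >= 128)
        {
          dcbz_line (dst);             /* zero a full cache line */
          dst += 128;
          n -= 128;
        }
      memset (dst, 0, n);              /* 1~127 byte tail */
    }
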
- .align 4
-L(huge_dcbz):
- andi. r11,r10,127
- neg r0,r10
- beq L(huge_dcbz_aligned)
-
- clrldi r0,r0,57
- subf r5,r0,r5
- srdi r0,r0,3
- mtocrf 0x01,r0
-
- /* Write 1~128 bytes until DST is aligned to 128 bytes. */
-8: bf 28,4f
-
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- std r4,32(r10)
- std r4,40(r10)
- std r4,48(r10)
- std r4,56(r10)
- addi r10,r10,64
-
- .align 4
-4: bf 29,2f
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- addi r10,r10,32
-
- .align 4
-2: bf 30,1f
- std r4,0(r10)
- std r4,8(r10)
- addi r10,r10,16
-
- .align 4
-1: bf 31,L(huge_dcbz_aligned)
- std r4,0(r10)
- addi r10,r10,8
-
-L(huge_dcbz_aligned):
- /* Setup dcbz unroll offsets and count numbers. */
- srdi r8,r5,9
- clrldi r11,r5,55
- cmpldi cr6,r11,0
- li r9,128
- cmpdi r8,0
- beq L(huge_tail)
- li r7,256
- li r6,384
- mtctr r8
-
- .align 4
-L(huge_loop):
-	/* Set 512 bytes to zero in each iteration; the loop unrolling shows
-	   a throughput boost for large sizes (2048 bytes or higher). */
- dcbz 0,r10
- dcbz r9,r10
- dcbz r7,r10
- dcbz r6,r10
- addi r10,r10,512
- bdnz L(huge_loop)
-
- beqlr cr6
-
-L(huge_tail):
- srdi r6,r11,8
- srdi r7,r11,4
- clrldi r8,r11,4
- cmpldi cr6,r8,0
- mtocrf 0x01,r6
-
- beq cr6,L(tail)
-
- /* We have 1~511 bytes remaining. */
- .align 4
-32: bf 31,16f
- dcbz 0,r10
- dcbz r9,r10
- addi r10,r10,256
-
- .align 4
-16: mtocrf 0x01,r7
- bf 28,8f
- dcbz 0,r10
- addi r10,r10,128
-
- .align 4
-8: bf 29,4f
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- std r4,32(r10)
- std r4,40(r10)
- std r4,48(r10)
- std r4,56(r10)
- addi r10,r10,64
-
- .align 4
-4: bf 30,2f
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- addi r10,r10,32
-
- .align 4
-2: bf 31,L(tail)
- std r4,0(r10)
- std r4,8(r10)
- addi r10,r10,16
- .align 4
-
- /* Remaining 1~15 bytes. */
-L(tail):
- mtocrf 0x01,r8
-
-	.align 4
-8: bf 28,4f
- std r4,0(r10)
- addi r10,r10,8
-
- .align 4
-4: bf 29,2f
- stw r4,0(r10)
- addi r10,r10,4
-
- .align 4
-2: bf 30,1f
- sth r4,0(r10)
- addi r10,r10,2
-
- .align 4
-1: bflr 31
- stb r4,0(r10)
- blr
-
- /* Handle short copies of 0~31 bytes. Best throughput is achieved
- by just unrolling all operations. */
- .align 4
-L(write_LT_32):
-	cmpldi cr6,r5,8
- mtocrf 0x01,r5
- ble cr6,L(write_LE_8)
-
- /* At least 9 bytes to go. */
-	neg r8,r10
- andi. r0,r8,3
- cmpldi cr1,r5,16
- beq L(write_LT_32_aligned)
-
-	/* Force 4-byte alignment for DST. */
- mtocrf 0x01,r0
- subf r5,r0,r5
-
-2: bf 30,1f
- sth r4,0(r10)
- addi r10,r10,2
-
-1: bf 31,L(end_4bytes_alignment)
- stb r4,0(r10)
- addi r10,r10,1
-
- .align 4
-L(end_4bytes_alignment):
- cmpldi cr1,r5,16
- mtocrf 0x01,r5
-
-L(write_LT_32_aligned):
- blt cr1,8f
-
- stw r4,0(r10)
- stw r4,4(r10)
- stw r4,8(r10)
- stw r4,12(r10)
- addi r10,r10,16
-
-8: bf 28,L(tail4)
- stw r4,0(r10)
- stw r4,4(r10)
- addi r10,r10,8
-
- .align 4
- /* Copies 4~7 bytes. */
-L(tail4):
- bf 29,L(tail2)
- stw r4,0(r10)
- bf 30,L(tail5)
- sth r4,4(r10)
- bflr 31
- stb r4,6(r10)
- blr
-
- .align 4
- /* Copies 2~3 bytes. */
-L(tail2):
- bf 30,1f
- sth r4,0(r10)
- bflr 31
- stb r4,2(r10)
- blr
-
- .align 4
-L(tail5):
- bflr 31
- stb r4,4(r10)
- blr
-
- .align 4
-1: bflr 31
- stb r4,0(r10)
- blr
-
- /* Handles copies of 0~8 bytes. */
- .align 4
-L(write_LE_8):
- bne cr6,L(tail4)
-
- stw r4,0(r10)
- stw r4,4(r10)
- blr
-END_GEN_TB (MEMSET,TB_TOCLESS)
-libc_hidden_builtin_def (memset)
-
-/* Copied from bzero.S to prevent the linker from inserting a stub
- between bzero and memset. */
-ENTRY (__bzero)
- CALL_MCOUNT 3
- mr r5,r4
- li r4,0
- b L(_memset)
-END (__bzero)
-#ifndef __bzero
-weak_alias (__bzero, bzero)
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
deleted file mode 100644
index 1fc7b7cd39..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
deleted file mode 100644
index 955e738cee..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/stpcpy.S
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Optimized stpcpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_STPCPY
-#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
-
-weak_alias (__stpcpy, stpcpy)
-libc_hidden_def (__stpcpy)
-libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
deleted file mode 100644
index c14d984dd0..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/stpncpy.S
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Optimized stpncpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_STPNCPY
-#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
-
-weak_alias (__stpncpy, stpncpy)
-libc_hidden_def (__stpncpy)
-libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
deleted file mode 100644
index 88b17a6eb1..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
+++ /dev/null
@@ -1,457 +0,0 @@
-/* Optimized strcasecmp implementation for PowerPC64.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <locale-defines.h>
-
-/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4]) */
-
-#ifndef USE_AS_STRNCASECMP
-# define __STRCASECMP __strcasecmp
-# define STRCASECMP strcasecmp
-#else
-# define __STRCASECMP __strncasecmp
-# define STRCASECMP strncasecmp
-#endif
-/* Convert 16 bytes to lowercase and compare. */
-#define TOLOWER() \
- vaddubm v8, v4, v1; \
- vaddubm v7, v4, v3; \
- vcmpgtub v8, v8, v2; \
- vsel v4, v7, v4, v8; \
- vaddubm v8, v5, v1; \
- vaddubm v7, v5, v3; \
- vcmpgtub v8, v8, v2; \
- vsel v5, v7, v5, v8; \
- vcmpequb. v7, v5, v4;
-
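
With v1 = 0xbf (i.e. -'A' modulo 256), v2 = 0x19 (25) and v3 = 0x20, the macro
above lowercases all 16 bytes without branches: bytes whose value minus 'A' is
at most 25 get 0x20 added, everything else passes through, and vsel performs
the per-byte select. The per-lane arithmetic in C:

    /* Scalar model of one lane of TOLOWER: c + 0xbf is c - 'A' (mod
       256), the vcmpgtub against 0x19 tests 'was this not A..Z', and
       vsel picks c or c + 0x20 accordingly.  */
    static unsigned char
    tolower_lane (unsigned char c)
    {
      return (unsigned char) (c + 0xbf) > 0x19
             ? c : (unsigned char) (c + 0x20);
    }
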
-/*
- * Get 16 bytes for the unaligned case.
- * reg1: Vector to hold the next 16 bytes.
- * reg2: Address to read from.
- * reg3: Permute control vector.
- * v8: Tmp vector used to mask unwanted bytes.
- * v9: Tmp vector, 0 when a null is found in the first 16 bytes.
- */
-#ifdef __LITTLE_ENDIAN__
-#define GET16BYTES(reg1, reg2, reg3) \
- lvx reg1, 0, reg2; \
- vspltisb v8, -1; \
- vperm v8, v8, reg1, reg3; \
- vcmpequb. v8, v0, v8; \
- beq cr6, 1f; \
- vspltisb v9, 0; \
- b 2f; \
- .align 4; \
-1: \
- addi r6, reg2, 16; \
- lvx v9, 0, r6; \
-2: \
- vperm reg1, v9, reg1, reg3;
-#else
-#define GET16BYTES(reg1, reg2, reg3) \
- lvx reg1, 0, reg2; \
- vspltisb v8, -1; \
- vperm v8, reg1, v8, reg3; \
- vcmpequb. v8, v0, v8; \
- beq cr6, 1f; \
- vspltisb v9, 0; \
- b 2f; \
- .align 4; \
-1: \
- addi r6, reg2, 16; \
- lvx v9, 0, r6; \
-2: \
- vperm reg1, reg1, v9, reg3;
-#endif
-
-/* Check null in v4, v5 and convert to lower. */
-#define CHECKNULLANDCONVERT() \
- vcmpequb. v7, v0, v5; \
- beq cr6, 3f; \
- vcmpequb. v7, v0, v4; \
- beq cr6, 3f; \
- b L(null_found); \
- .align 4; \
-3: \
- TOLOWER()
-
-#ifdef _ARCH_PWR8
-# define VCLZD_V8_v7 vclzd v8, v7;
-# define MFVRD_R3_V1 mfvrd r3, v1;
-# define VSUBUDM_V9_V8 vsubudm v9, v9, v8;
-# define VPOPCNTD_V8_V8 vpopcntd v8, v8;
-# define VADDUQM_V7_V8 vadduqm v9, v7, v8;
-#else
-# define VCLZD_V8_v7 .long 0x11003fc2
-# define MFVRD_R3_V1 .long 0x7c230067
-# define VSUBUDM_V9_V8 .long 0x112944c0
-# define VPOPCNTD_V8_V8 .long 0x110047c3
-# define VADDUQM_V7_V8 .long 0x11274100
-#endif
-
- .machine power7
-
-ENTRY (__STRCASECMP)
-#ifdef USE_AS_STRNCASECMP
- CALL_MCOUNT 3
-#else
- CALL_MCOUNT 2
-#endif
-#define rRTN r3 /* Return value */
-#define rSTR1 r10 /* 1st string */
-#define rSTR2 r4 /* 2nd string */
-#define rCHAR1 r6 /* Byte read from 1st string */
-#define rCHAR2 r7 /* Byte read from 2nd string */
-#define rADDR1 r8 /* Address of tolower(rCHAR1) */
-#define rADDR2 r12 /* Address of tolower(rCHAR2) */
-#define rLWR1 r8 /* Word tolower(rCHAR1) */
-#define rLWR2 r12 /* Word tolower(rCHAR2) */
-#define rTMP r9
-#define rLOC r11 /* Default locale address */
-
- cmpd cr7, rRTN, rSTR2
-
- /* Get locale address. */
- ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
- add rLOC, rTMP, __libc_tsd_LOCALE@tls
- ld rLOC, 0(rLOC)
-
- mr rSTR1, rRTN
- li rRTN, 0
- beqlr cr7
-#ifdef USE_AS_STRNCASECMP
- cmpdi cr7, r5, 0
- beq cr7, L(retnull)
- cmpdi cr7, r5, 16
- blt cr7, L(bytebybyte)
-#endif
- vspltisb v0, 0
- vspltisb v8, -1
-	/* Check for null in the initial characters.
-	   Check at most 16 chars depending on the alignment.
-	   If a null is present, proceed byte by byte. */
- lvx v4, 0, rSTR1
-#ifdef __LITTLE_ENDIAN__
- lvsr v10, 0, rSTR1 /* Compute mask. */
- vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */
-#else
- lvsl v10, 0, rSTR1
- vperm v9, v4, v8, v10
-#endif
- vcmpequb. v9, v0, v9 /* Check for null bytes. */
- bne cr6, L(bytebybyte)
- lvx v5, 0, rSTR2
- /* Calculate alignment. */
-#ifdef __LITTLE_ENDIAN__
- lvsr v6, 0, rSTR2
- vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */
-#else
- lvsl v6, 0, rSTR2
- vperm v9, v5, v8, v6
-#endif
- vcmpequb. v9, v0, v9 /* Check for null bytes. */
- bne cr6, L(bytebybyte)
-	/* Check if the locale has non-ASCII characters. */
- ld rTMP, 0(rLOC)
- addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
- lwz rTMP, 0(r6)
- cmpdi cr7, rTMP, 1
- beq cr7, L(bytebybyte)
-
-	/* Load vector registers with values used for TOLOWER. */
-	/* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte. */
- vspltisb v3, 2
- vspltisb v9, 4
- vsl v3, v3, v9
- vaddubm v1, v3, v3
- vnor v1, v1, v1
- vspltisb v2, 7
- vsububm v2, v3, v2
-
- andi. rADDR1, rSTR1, 0xF
- beq cr0, L(align)
- addi r6, rSTR1, 16
- lvx v9, 0, r6
- /* Compute 16 bytes from previous two loads. */
-#ifdef __LITTLE_ENDIAN__
- vperm v4, v9, v4, v10
-#else
- vperm v4, v4, v9, v10
-#endif
-L(align):
- andi. rADDR2, rSTR2, 0xF
- beq cr0, L(align1)
- addi r6, rSTR2, 16
- lvx v9, 0, r6
- /* Compute 16 bytes from previous two loads. */
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v9, v5, v6
-#else
- vperm v5, v5, v9, v6
-#endif
-L(align1):
- CHECKNULLANDCONVERT()
- blt cr6, L(match)
- b L(different)
- .align 4
-L(match):
- clrldi r6, rSTR1, 60
- subfic r7, r6, 16
-#ifdef USE_AS_STRNCASECMP
- sub r5, r5, r7
-#endif
- add rSTR1, rSTR1, r7
- add rSTR2, rSTR2, r7
- andi. rADDR2, rSTR2, 0xF
- addi rSTR1, rSTR1, -16
- addi rSTR2, rSTR2, -16
- beq cr0, L(aligned)
-#ifdef __LITTLE_ENDIAN__
- lvsr v6, 0, rSTR2
-#else
- lvsl v6, 0, rSTR2
-#endif
- /* There are 2 loops depending on the input alignment.
- Each loop gets 16 bytes from s1 and s2, check for null,
- convert to lowercase and compare. Loop till difference
- or null occurs. */
-L(s1_align):
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
-#ifdef USE_AS_STRNCASECMP
- cmpdi cr7, r5, 16
- blt cr7, L(bytebybyte)
- addi r5, r5, -16
-#endif
- lvx v4, 0, rSTR1
- GET16BYTES(v5, rSTR2, v6)
- CHECKNULLANDCONVERT()
- blt cr6, L(s1_align)
- b L(different)
- .align 4
-L(aligned):
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
-#ifdef USE_AS_STRNCASECMP
- cmpdi cr7, r5, 16
- blt cr7, L(bytebybyte)
- addi r5, r5, -16
-#endif
- lvx v4, 0, rSTR1
- lvx v5, 0, rSTR2
- CHECKNULLANDCONVERT()
- blt cr6, L(aligned)
-
- /* Calculate and return the difference. */
-L(different):
- vaddubm v1, v3, v3
- vcmpequb v7, v0, v7
-#ifdef __LITTLE_ENDIAN__
- /* Count trailing zero. */
- vspltisb v8, -1
- VADDUQM_V7_V8
- vandc v8, v9, v7
- VPOPCNTD_V8_V8
- vspltb v6, v8, 15
- vcmpequb. v6, v6, v1
- blt cr6, L(shift8)
-#else
- /* Count leading zero. */
- VCLZD_V8_v7
- vspltb v6, v8, 7
- vcmpequb. v6, v6, v1
- blt cr6, L(shift8)
- vsro v8, v8, v1
-#endif
- b L(skipsum)
- .align 4
-L(shift8):
- vsumsws v8, v8, v0
-L(skipsum):
-#ifdef __LITTLE_ENDIAN__
- /* Shift registers based on leading zero count. */
- vsro v6, v5, v8
- vsro v7, v4, v8
- /* Merge and move to GPR. */
- vmrglb v6, v6, v7
- vslo v1, v6, v1
- MFVRD_R3_V1
- /* Place the characters that are different in first position. */
- sldi rSTR2, rRTN, 56
- srdi rSTR2, rSTR2, 56
- sldi rSTR1, rRTN, 48
- srdi rSTR1, rSTR1, 56
-#else
- vslo v6, v5, v8
- vslo v7, v4, v8
- vmrghb v1, v6, v7
- MFVRD_R3_V1
- srdi rSTR2, rRTN, 48
- sldi rSTR2, rSTR2, 56
- srdi rSTR2, rSTR2, 56
- srdi rSTR1, rRTN, 56
-#endif
- subf rRTN, rSTR1, rSTR2
- extsw rRTN, rRTN
- blr
-
- .align 4
- /* OK. We've hit the end of the string. We need to be careful that
- we don't compare two strings as different because of junk beyond
- the end of the strings... */
-L(null_found):
- vaddubm v10, v3, v3
-#ifdef __LITTLE_ENDIAN__
- /* Count trailing zero. */
- vspltisb v8, -1
- VADDUQM_V7_V8
- vandc v8, v9, v7
- VPOPCNTD_V8_V8
- vspltb v6, v8, 15
- vcmpequb. v6, v6, v10
- blt cr6, L(shift_8)
-#else
- /* Count leading zero. */
- VCLZD_V8_v7
- vspltb v6, v8, 7
- vcmpequb. v6, v6, v10
- blt cr6, L(shift_8)
- vsro v8, v8, v10
-#endif
- b L(skipsum1)
- .align 4
-L(shift_8):
- vsumsws v8, v8, v0
-L(skipsum1):
- /* Calculate shift count based on count of zero. */
- vspltisb v10, 7
- vslb v10, v10, v10
- vsldoi v9, v0, v10, 1
- VSUBUDM_V9_V8
- vspltisb v8, 8
- vsldoi v8, v0, v8, 1
- VSUBUDM_V9_V8
- /* Shift and remove junk after null character. */
-#ifdef __LITTLE_ENDIAN__
- vslo v5, v5, v9
- vslo v4, v4, v9
-#else
- vsro v5, v5, v9
- vsro v4, v4, v9
-#endif
- /* Convert and compare 16 bytes. */
- TOLOWER()
- blt cr6, L(retnull)
- b L(different)
- .align 4
-L(retnull):
- li rRTN, 0
- blr
- .align 4
-L(bytebybyte):
-	/* Unrolled loop for POWER: loads are done with 'lbz' plus
-	   offset, and the string pointers are updated only at the end
-	   of the unrolled sequence. */
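
Each unrolled step below looks the character up in the locale's 32-bit tolower
table (hence the sldi by 2 to scale the index) and stops on the first
difference or at s1's terminating null. The loop body, de-unrolled into C:

    #include <stdint.h>

    /* Sketch: tolower_tab stands for the LOCALE_CTYPE_TOLOWER word
       table loaded into rLOC; the asm runs four of these per iteration
       and bumps the pointers once.  */
    static int
    casecmp_bytes (const unsigned char *s1, const unsigned char *s2,
                   const int32_t *tolower_tab)
    {
      int32_t l1, l2;
      do
        {
          l1 = tolower_tab[*s1++];
          l2 = tolower_tab[*s2++];
        }
      while (l1 == l2 && s1[-1] != '\0');
      return l1 - l2;
    }
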
- ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
- lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
- lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
-#ifdef USE_AS_STRNCASECMP
- rldicl rTMP, r5, 62, 2
- cmpdi cr7, rTMP, 0
- beq cr7, L(lessthan4)
- mtctr rTMP
-#endif
-L(loop):
- cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
- sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
- sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
- lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
- lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
- cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
- crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */
- beq cr1, L(done)
- lbz rCHAR1, 1(rSTR1)
- lbz rCHAR2, 1(rSTR2)
- cmpdi rCHAR1, 0
- sldi rADDR1, rCHAR1, 2
- sldi rADDR2, rCHAR2, 2
- lwzx rLWR1, rLOC, rADDR1
- lwzx rLWR2, rLOC, rADDR2
- cmpw cr1, rLWR1, rLWR2
- crorc 4*cr1+eq,eq,4*cr1+eq
- beq cr1, L(done)
- lbz rCHAR1, 2(rSTR1)
- lbz rCHAR2, 2(rSTR2)
- cmpdi rCHAR1, 0
- sldi rADDR1, rCHAR1, 2
- sldi rADDR2, rCHAR2, 2
- lwzx rLWR1, rLOC, rADDR1
- lwzx rLWR2, rLOC, rADDR2
- cmpw cr1, rLWR1, rLWR2
- crorc 4*cr1+eq,eq,4*cr1+eq
- beq cr1, L(done)
- lbz rCHAR1, 3(rSTR1)
- lbz rCHAR2, 3(rSTR2)
- cmpdi rCHAR1, 0
-	/* Increment both string pointers. */
- addi rSTR1, rSTR1, 4
- addi rSTR2, rSTR2, 4
- sldi rADDR1, rCHAR1, 2
- sldi rADDR2, rCHAR2, 2
- lwzx rLWR1, rLOC, rADDR1
- lwzx rLWR2, rLOC, rADDR2
- cmpw cr1, rLWR1, rLWR2
- crorc 4*cr1+eq,eq,4*cr1+eq
- beq cr1, L(done)
- lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
- lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
-#ifdef USE_AS_STRNCASECMP
- bdnz L(loop)
-#else
- b L(loop)
-#endif
-#ifdef USE_AS_STRNCASECMP
-L(lessthan4):
- clrldi r5, r5, 62
- cmpdi cr7, r5, 0
- beq cr7, L(retnull)
- mtctr r5
-L(loop1):
- cmpdi rCHAR1, 0
- sldi rADDR1, rCHAR1, 2
- sldi rADDR2, rCHAR2, 2
- lwzx rLWR1, rLOC, rADDR1
- lwzx rLWR2, rLOC, rADDR2
- cmpw cr1, rLWR1, rLWR2
- crorc 4*cr1+eq,eq,4*cr1+eq
- beq cr1, L(done)
- addi rSTR1, rSTR1, 1
- addi rSTR2, rSTR2, 1
- lbz rCHAR1, 0(rSTR1)
- lbz rCHAR2, 0(rSTR2)
- bdnz L(loop1)
-#endif
-L(done):
- subf r0, rLWR2, rLWR1
- extsw rRTN, r0
- blr
-END (__STRCASECMP)
-
-weak_alias (__STRCASECMP, STRCASECMP)
-libc_hidden_builtin_def (__STRCASECMP)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c b/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
deleted file mode 100644
index 0e746b7718..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Optimized strcasestr implementation for PowerPC64/POWER8.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <string.h>
-
-#define STRCASESTR __strcasestr_ppc
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(__name)
-
-#undef weak_alias
-#define weak_alias(a,b)
-extern __typeof (strcasestr) __strcasestr_ppc attribute_hidden;
-
-#include <string/strcasestr.c>
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasestr.S b/sysdeps/powerpc/powerpc64/power8/strcasestr.S
deleted file mode 100644
index 6ac6572f3b..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strcasestr.S
+++ /dev/null
@@ -1,538 +0,0 @@
-/* Optimized strcasestr implementation for PowerPC64/POWER8.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <locale-defines.h>
-
-/* char * [r3] strcasestr (char *s [r3], char *pat [r4]) */
-
-/* The performance gain is obtained by comparing 16 bytes. */
-
-/* When the first char of r4 is hit ITERATIONS times in r3,
-   fall back to the default implementation. */
-#define ITERATIONS 64
-
-#ifndef STRCASESTR
-# define STRCASESTR __strcasestr
-#endif
-
-#ifndef STRLEN
-/* For builds without IFUNC support, local calls should be made to internal
- GLIBC symbol (created by libc_hidden_builtin_def). */
-# ifdef SHARED
-# define STRLEN __GI_strlen
-# else
-# define STRLEN strlen
-# endif
-#endif
-
-#ifndef STRNLEN
-/* For builds without IFUNC support, local calls should be made to internal
- GLIBC symbol (created by libc_hidden_builtin_def). */
-# ifdef SHARED
-# define STRNLEN __GI_strnlen
-# else
-# define STRNLEN __strnlen
-# endif
-#endif
-
-#ifndef STRCHR
-# ifdef SHARED
-# define STRCHR __GI_strchr
-# else
-# define STRCHR strchr
-# endif
-#endif
-
-/* Convert 16 bytes of v4 and reg to lowercase and compare. */
-#define TOLOWER(reg) \
- vcmpgtub v6, v4, v1; \
- vcmpgtub v7, v2, v4; \
- vand v8, v7, v6; \
- vand v8, v8, v3; \
- vor v4, v8, v4; \
- vcmpgtub v6, reg, v1; \
- vcmpgtub v7, v2, reg; \
- vand v8, v7, v6; \
- vand v8, v8, v3; \
- vor reg, v8, reg; \
- vcmpequb. v6, reg, v4;
-
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#ifdef _ARCH_PWR8
-#define VCLZD_V8_v7 vclzd v8, v7;
-#else
-#define VCLZD_V8_v7 .long 0x11003fc2
-#endif
-
-#define FRAMESIZE (FRAME_MIN_SIZE+48)
-/* TODO: change this to .machine power8 when the minimum required binutils
- allows it. */
- .machine power7
-EALIGN (STRCASESTR, 4, 0)
- CALL_MCOUNT 2
- mflr r0 /* Load link register LR to r0. */
-	std	r31, -8(r1)	/* Save caller's register r31. */
-	std	r30, -16(r1)	/* Save caller's register r30. */
-	std	r29, -24(r1)	/* Save caller's register r29. */
-	std	r28, -32(r1)	/* Save caller's register r28. */
-	std	r27, -40(r1)	/* Save caller's register r27. */
- std r0, 16(r1) /* Store the link register. */
- cfi_offset(r31, -8)
- cfi_offset(r30, -16)
- cfi_offset(r29, -24)
- cfi_offset(r28, -32)
- cfi_offset(r27, -40)
- cfi_offset(lr, 16)
- stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
- cfi_adjust_cfa_offset(FRAMESIZE)
-
- dcbt 0, r3
- dcbt 0, r4
- cmpdi cr7, r3, 0 /* Input validation. */
- beq cr7, L(retnull)
- cmpdi cr7, r4, 0
- beq cr7, L(retnull)
-
- mr r29, r3
- mr r30, r4
- /* Load first byte from r4 and check if its null. */
- lbz r6, 0(r4)
- cmpdi cr7, r6, 0
- beq cr7, L(ret_r3)
-
- ld r10, __libc_tsd_LOCALE@got@tprel(r2)
- add r9, r10, __libc_tsd_LOCALE@tls
- ld r9, 0(r9)
- ld r9, LOCALE_CTYPE_TOUPPER(r9)
- sldi r10, r6, 2 /* Convert to upper case. */
- lwzx r28, r9, r10
-
- ld r10, __libc_tsd_LOCALE@got@tprel(r2)
- add r11, r10, __libc_tsd_LOCALE@tls
- ld r11, 0(r11)
- ld r11, LOCALE_CTYPE_TOLOWER(r11)
- sldi r10, r6, 2 /* Convert to lower case. */
- lwzx r27, r11, r10
-
- /* Check if the first char is present. */
- mr r4, r27
- bl STRCHR
- nop
- mr r5, r3
- mr r3, r29
- mr r29, r5
- mr r4, r28
- bl STRCHR
- nop
- cmpdi cr7, r29, 0
- beq cr7, L(firstpos)
- cmpdi cr7, r3, 0
- beq cr7, L(skipcheck)
- cmpw cr7, r3, r29
- ble cr7, L(firstpos)
-	/* Move r3 to the first occurrence. */
-L(skipcheck):
- mr r3, r29
-L(firstpos):
- mr r29, r3
-
- sldi r9, r27, 8
- or r28, r9, r28
- /* Reg r27 is used to count the number of iterations. */
- li r27, 0
- /* If first char of search str is not present. */
- cmpdi cr7, r3, 0
- ble cr7, L(end)
-
- /* Find the length of pattern. */
- mr r3, r30
- bl STRLEN
- nop
-
-	cmpdi	cr7, r3, 0	/* If the search string is empty.  */
- beq cr7, L(ret_r3)
-
- mr r31, r3
- mr r4, r3
- mr r3, r29
- bl STRNLEN
- nop
-
- cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
- blt cr7, L(retnull)
-
- mr r3, r29
-
-	/* Fall back to byte-by-byte if the locale's single-byte case
-	   mapping is not ASCII.  */
- ld r10, __libc_tsd_LOCALE@got@tprel(r2)
- add r9, r10, __libc_tsd_LOCALE@tls
- ld r9, 0(r9)
- ld r7, 0(r9)
- addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
- lwz r8, 0(r7)
- cmpdi cr7, r8, 1
- beq cr7, L(bytebybyte)
-
-	/* If len(r4) < 16, handle byte by byte.  For shorter strings we
-	   will not use vector registers.  */
- cmpdi cr7, r31, 16
- blt cr7, L(bytebybyte)
-
- /* Comparison values used for TOLOWER. */
- /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */
- vspltish v0, 0
- vspltisb v5, 2
- vspltisb v4, 4
- vsl v3, v5, v4
- vaddubm v1, v3, v3
- vspltisb v5, 15
- vaddubm v2, v5, v5
- vaddubm v2, v1, v2
- vspltisb v4, -3
- vaddubm v2, v2, v4
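
   The constants are built from splat-immediate arithmetic alone
   (vspltisb immediates are limited to -16..15); the values work out as:

      v3 = 2 << 4            = 32   (the case bit, 'a' - 'A')
      v1 = 32 + 32           = 64   ('A' - 1)
      v2 = 64 + 15 + 15 - 3  = 91   ('Z' + 1)
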
-
-	/*
-	   1. Load 16 bytes from r3 and r4.
-	   2. Check for a null byte; if one is found, take the byte-by-byte
-	      path.
-	   3. Otherwise, convert both to lowercase and compare.
-	   4. If they match, proceed to 1.
-	   5. If they do not match, check whether the first char of r4 is
-	      present in the loaded 16 bytes of r3.
-	   6. If so, move the position, load the next 16 bytes of r3 and
-	      proceed to 2.  (A C sketch of this loop follows.)
-	*/
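
   In C, that control flow looks roughly as follows (load16, has_zero,
   tolower16, equal16 and shift_for_first_char are hypothetical helpers
   standing in for the lvx, vcmpequb., TOLOWER and shift sequences):

      v5 = load16 (pat);                   /* first 16 bytes of pattern  */
      for (iter = 0; iter < ITERATIONS; iter++)
        {
          v4 = load16 (s);
          if (has_zero (v4) || has_zero (v5))
            goto bytebybyte;               /* step 2  */
          if (equal16 (tolower16 (v4), tolower16 (v5)))
            goto match;                    /* step 4  */
          s += shift_for_first_char (v4, v5);  /* steps 5-6: 1, 8 or 16  */
        }
      return __strcasestr_ppc (s, pat);    /* ITERATIONS exhausted  */
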
-
- mr r8, r3 /* Save r3 for future use. */
- mr r4, r30 /* Restore r4. */
- clrldi r10, r4, 60
- lvx v5, 0, r4 /* Load 16 bytes from r4. */
- cmpdi cr7, r10, 0
- beq cr7, L(begin2)
- /* If r4 is unaligned, load another 16 bytes. */
-#ifdef __LITTLE_ENDIAN__
- lvsr v7, 0, r4
-#else
- lvsl v7, 0, r4
-#endif
- addi r5, r4, 16
- lvx v9, 0, r5
-#ifdef __LITTLE_ENDIAN__
- vperm v5, v9, v5, v7
-#else
- vperm v5, v5, v9, v7
-#endif
-L(begin2):
- lvx v4, 0, r3
- vcmpequb. v7, v0, v4 /* Check for null. */
- beq cr6, L(nullchk6)
- b L(trailcheck)
-
- .align 4
-L(nullchk6):
- clrldi r10, r3, 60
- cmpdi cr7, r10, 0
- beq cr7, L(next16)
-#ifdef __LITTLE_ENDIAN__
- lvsr v7, 0, r3
-#else
- lvsl v7, 0, r3
-#endif
- addi r5, r3, 16
- /* If r3 is unaligned, load another 16 bytes. */
- lvx v10, 0, r5
-#ifdef __LITTLE_ENDIAN__
- vperm v4, v10, v4, v7
-#else
- vperm v4, v4, v10, v7
-#endif
-L(next16):
- vcmpequb. v6, v0, v5 /* Check for null. */
- beq cr6, L(nullchk)
- b L(trailcheck)
-
- .align 4
-L(nullchk):
- vcmpequb. v6, v0, v4
- beq cr6, L(nullchk1)
- b L(retnull)
-
- .align 4
-L(nullchk1):
-	/* Convert both v4 and v5 to lowercase and compare.  */
-	TOLOWER(v5)
-	/* If both are the same, branch to match.  */
- blt cr6, L(match)
- /* Find if the first char is present in next 15 bytes. */
-#ifdef __LITTLE_ENDIAN__
- vspltb v6, v5, 15
- vsldoi v7, v0, v4, 15
-#else
- vspltb v6, v5, 0
- vspltisb v7, 8
- vslo v7, v4, v7
-#endif
- vcmpequb v7, v6, v7
- vcmpequb. v6, v0, v7
- /* Shift r3 by 16 bytes and proceed. */
- blt cr6, L(shift16)
- VCLZD_V8_v7
-#ifdef __LITTLE_ENDIAN__
- vspltb v6, v8, 15
-#else
- vspltb v6, v8, 7
-#endif
- vcmpequb. v6, v6, v1
- /* Shift r3 by 8 bytes and proceed. */
- blt cr6, L(shift8)
- b L(begin)
-
- .align 4
-L(match):
-	/* The first 16 bytes matched; check the following bytes.  */
- cmpdi cr7, r31, 16
- mr r29, r3
- beq cr7, L(ret_r3)
-
-L(secondmatch):
- addi r3, r3, 16
- addi r4, r4, 16
- /* Load next 16 bytes of r3 and r4 and compare. */
- clrldi r10, r4, 60
- cmpdi cr7, r10, 0
- beq cr7, L(nextload)
- /* Handle unaligned case. */
- vor v6, v9, v9
- vcmpequb. v7, v0, v6
- beq cr6, L(nullchk2)
- b L(trailcheck)
-
- .align 4
-L(nullchk2):
-#ifdef __LITTLE_ENDIAN__
- lvsr v7, 0, r4
-#else
- lvsl v7, 0, r4
-#endif
- addi r5, r4, 16
- /* If r4 is unaligned, load another 16 bytes. */
- lvx v9, 0, r5
-#ifdef __LITTLE_ENDIAN__
- vperm v11, v9, v6, v7
-#else
- vperm v11, v6, v9, v7
-#endif
- b L(compare)
-
- .align 4
-L(nextload):
- lvx v11, 0, r4
-L(compare):
- vcmpequb. v7, v0, v11
- beq cr6, L(nullchk3)
- b L(trailcheck)
-
- .align 4
-L(nullchk3):
- clrldi r10, r3, 60
- cmpdi cr7, r10, 0
- beq cr7, L(nextload1)
- /* Handle unaligned case. */
- vor v4, v10, v10
- vcmpequb. v7, v0, v4
- beq cr6, L(nullchk4)
- b L(retnull)
-
- .align 4
-L(nullchk4):
-#ifdef __LITTLE_ENDIAN__
- lvsr v7, 0, r3
-#else
- lvsl v7, 0, r3
-#endif
- addi r5, r3, 16
- /* If r3 is unaligned, load another 16 bytes. */
- lvx v10, 0, r5
-#ifdef __LITTLE_ENDIAN__
- vperm v4, v10, v4, v7
-#else
- vperm v4, v4, v10, v7
-#endif
- b L(compare1)
-
- .align 4
-L(nextload1):
- lvx v4, 0, r3
-L(compare1):
- vcmpequb. v7, v0, v4
- beq cr6, L(nullchk5)
- b L(retnull)
-
- .align 4
-L(nullchk5):
-	/* Convert both v4 and v11 to lowercase and compare.  */
-	TOLOWER(v11)
-	/* If both are the same, branch to secondmatch.  */
- blt cr6, L(secondmatch)
- /* Continue the search. */
- b L(begin)
-
- .align 4
-L(trailcheck):
- ld r10, __libc_tsd_LOCALE@got@tprel(r2)
- add r11, r10, __libc_tsd_LOCALE@tls
- ld r11, 0(r11)
- ld r11, LOCALE_CTYPE_TOLOWER(r11)
-L(loop2):
- lbz r5, 0(r3) /* Load byte from r3. */
- lbz r6, 0(r4) /* Load next byte from r4. */
- cmpdi cr7, r6, 0 /* Is it null? */
- beq cr7, L(updater3)
- cmpdi cr7, r5, 0 /* Is it null? */
- beq cr7, L(retnull) /* If yes, return. */
- addi r3, r3, 1
- addi r4, r4, 1 /* Increment r4. */
- sldi r10, r5, 2 /* Convert to lower case. */
- lwzx r10, r11, r10
- sldi r7, r6, 2 /* Convert to lower case. */
- lwzx r7, r11, r7
- cmpw cr7, r7, r10 /* Compare with byte from r4. */
- bne cr7, L(begin)
- b L(loop2)
-
- .align 4
-L(shift8):
- addi r8, r8, 7
- b L(begin)
- .align 4
-L(shift16):
- addi r8, r8, 15
- .align 4
-L(begin):
- addi r8, r8, 1
- mr r3, r8
-	/* When the iteration count reaches ITERATIONS, fall back to the
-	   default implementation.  */
- addi r27, r27, 1
- cmpdi cr7, r27, ITERATIONS
- beq cr7, L(default)
- mr r4, r30 /* Restore r4. */
- b L(begin2)
-
- /* Handling byte by byte. */
- .align 4
-L(loop1):
- mr r3, r8
- addi r27, r27, 1
- cmpdi cr7, r27, ITERATIONS
- beq cr7, L(default)
- mr r29, r8
- srdi r4, r28, 8
- /* Check if the first char is present. */
- bl STRCHR
- nop
- mr r5, r3
- mr r3, r29
- mr r29, r5
- sldi r4, r28, 56
- srdi r4, r4, 56
- bl STRCHR
- nop
- cmpdi cr7, r29, 0
- beq cr7, L(nextpos)
- cmpdi cr7, r3, 0
- beq cr7, L(skipcheck1)
- cmpw cr7, r3, r29
- ble cr7, L(nextpos)
-	/* Move r3 to the first occurrence.  */
-L(skipcheck1):
- mr r3, r29
-L(nextpos):
- mr r29, r3
- cmpdi cr7, r3, 0
- ble cr7, L(retnull)
-L(bytebybyte):
- ld r10, __libc_tsd_LOCALE@got@tprel(r2)
- add r11, r10, __libc_tsd_LOCALE@tls
- ld r11, 0(r11)
- ld r11, LOCALE_CTYPE_TOLOWER(r11)
- mr r4, r30 /* Restore r4. */
- mr r8, r3 /* Save r3. */
- addi r8, r8, 1
-
-L(loop):
- addi r3, r3, 1
- lbz r5, 0(r3) /* Load byte from r3. */
- addi r4, r4, 1 /* Increment r4. */
- lbz r6, 0(r4) /* Load next byte from r4. */
- cmpdi cr7, r6, 0 /* Is it null? */
- beq cr7, L(updater3)
- cmpdi cr7, r5, 0 /* Is it null? */
- beq cr7, L(retnull) /* If yes, return. */
- sldi r10, r5, 2 /* Convert to lower case. */
- lwzx r10, r11, r10
- sldi r7, r6, 2 /* Convert to lower case. */
- lwzx r7, r11, r7
- cmpw cr7, r7, r10 /* Compare with byte from r4. */
- bne cr7, L(loop1)
- b L(loop)
-
- /* Handling return values. */
- .align 4
-L(updater3):
-	subf	r3, r31, r3	/* Subtract r31 (the length of r4) from r3.  */
- b L(end)
-
- .align 4
-L(ret_r3):
- mr r3, r29 /* Return point of match. */
- b L(end)
-
- .align 4
-L(retnull):
- li r3, 0 /* Substring was not found. */
- b L(end)
-
- .align 4
-L(default):
- mr r4, r30
- bl __strcasestr_ppc
- nop
-
- .align 4
-L(end):
- addi r1, r1, FRAMESIZE /* Restore stack pointer. */
- cfi_adjust_cfa_offset(-FRAMESIZE)
- ld r0, 16(r1) /* Restore the saved link register. */
- ld r27, -40(r1)
- ld r28, -32(r1)
-	ld	r29, -24(r1)	/* Restore caller's saved register r29.  */
-	ld	r30, -16(r1)	/* Restore caller's saved register r30.  */
-	ld	r31, -8(r1)	/* Restore caller's saved register r31.  */
- cfi_restore(lr)
- cfi_restore(r27)
- cfi_restore(r28)
- cfi_restore(r29)
- cfi_restore(r30)
- cfi_restore(r31)
-	mtlr	r0		/* Move r0 back into the link register.  */
- blr
-END (STRCASESTR)
-
-weak_alias (__strcasestr, strcasestr)
-libc_hidden_def (__strcasestr)
-libc_hidden_builtin_def (strcasestr)
diff --git a/sysdeps/powerpc/powerpc64/power8/strchr.S b/sysdeps/powerpc/powerpc64/power8/strchr.S
deleted file mode 100644
index e0c185c162..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strchr.S
+++ /dev/null
@@ -1,377 +0,0 @@
-/* Optimized strchr implementation for PowerPC64/POWER8.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifdef USE_AS_STRCHRNUL
-# ifndef STRCHRNUL
-# define FUNC_NAME __strchrnul
-# else
-# define FUNC_NAME STRCHRNUL
-# endif
-#else
-# ifndef STRCHR
-# define FUNC_NAME strchr
-# else
-# define FUNC_NAME STRCHR
-# endif
-#endif /* !USE_AS_STRCHRNUL */
-
-/* char * [r3] strchr (char *s [r3], int c [r4]) */
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-/* TODO: change this to .machine power8 when the minimum required binutils
- allows it. */
- .machine power7
-ENTRY (FUNC_NAME)
- CALL_MCOUNT 2
- dcbt 0,r3
- clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
- cmpdi cr7,r4,0
- ld r12,0(r8) /* Load doubleword from memory. */
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
-
- beq cr7,L(null_match)
-
- /* Replicate byte to doubleword. */
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32
- insrdi r4,r4,32,0
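
   For example, with c = 'a' (0x61) the three insrdi steps widen r4 to
   0x6161, 0x61616161 and finally 0x6161616161616161.
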
-
- /* Now r4 has a doubleword of c bytes and r0 has
- a doubleword of null bytes. */
-
- cmpb r10,r12,r4 /* Compare each byte against c byte. */
- cmpb r11,r12,r0 /* Compare each byte against null byte. */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r10,r10,r6
- srd r11,r11,r6
- sld r10,r10,r6
- sld r11,r11,r6
-#else
- sld r10,r10,r6
- sld r11,r11,r6
- srd r10,r10,r6
- srd r11,r11,r6
-#endif
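
   For instance, if s starts 3 bytes into its doubleword, r6 holds 24
   and the shift pair clears any spurious match in those 3 bytes (on
   little-endian: r10 = (r10 >> 24) << 24).
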
- or r5,r10,r11 /* OR the results to speed things up. */
- cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
- have been found. */
- bne cr7,L(done)
-
- mtcrf 0x01,r8
-
- /* Are we now aligned to a doubleword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop)
-
- /* Handle WORD2 of pair. */
- ldu r12,8(r8)
- cmpb r10,r12,r4
- cmpb r11,r12,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- bne cr7,L(done)
- b L(loop) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
-
- .p2align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r9,16(r8)
- cmpb r10,r12,r4
- cmpb r11,r12,r0
- cmpb r6,r9,r4
- cmpb r7,r9,r0
- or r5,r10,r11
- or r9,r6,r7
- or r12,r5,r9
- cmpdi cr7,r12,0
- beq cr7,L(vector)
- /* OK, one (or both) of the doublewords contains a c/null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a c/null byte. */
-
- cmpdi cr6,r5,0
- addi r8,r8,-8
- bne cr6,L(done)
-
- /* The c/null byte must be in the second doubleword. Adjust the
- address again and move the result of cmpb to r10 so we can calculate
- the pointer. */
-
- mr r10,r6
- mr r11,r7
- addi r8,r8,8
-#ifdef USE_AS_STRCHRNUL
- mr r5, r9
-#endif
- /* r10/r11 have the output of the cmpb instructions, that is,
- 0xff in the same position as the c/null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-L(done):
-#ifdef USE_AS_STRCHRNUL
- mr r10, r5
-#endif
-#ifdef __LITTLE_ENDIAN__
- addi r3,r10,-1
- andc r3,r3,r10
- popcntd r0,r3
-# ifndef USE_AS_STRCHRNUL
- addi r4,r11,-1
- andc r4,r4,r11
- cmpld cr7,r3,r4
- bgt cr7,L(no_match)
-# endif
-#else
- cntlzd r0,r10 /* Count leading zeros before c matches. */
-# ifndef USE_AS_STRCHRNUL
- cmpld cr7,r11,r10
- bgt cr7,L(no_match)
-# endif
-#endif
- srdi r0,r0,3 /* Convert leading zeros to bytes. */
- add r3,r8,r0 /* Return address of the matching c byte
- or null in case c was not found. */
- blr
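
   The little-endian sequence above is the usual count-trailing-zeros
   idiom: (x - 1) & ~x sets exactly the trailing zero bits and popcntd
   counts them.  E.g. a cmpb result of 0x0000000000ff0000 gives
   (x - 1) & ~x = 0xffff, popcntd = 16, and srdi by 3 turns that into
   byte offset 2.
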
-
- /* Check the first 32B in GPR's and move to vectorized loop. */
- .p2align 5
-L(vector):
- addi r3, r8, 8
- andi. r10, r3, 31
- bne cr0, L(loop)
- vspltisb v0, 0
- /* Precompute vbpermq constant. */
- vspltisb v10, 3
- lvsl v11, r0, r0
- vslb v10, v11, v10
- MTVRD(v1,r4)
- li r5, 16
- vspltb v1, v1, 7
- /* Compare 32 bytes in each loop. */
-L(continue):
- lvx v4, 0, r3
- lvx v5, r3, r5
- vcmpequb v2, v0, v4
- vcmpequb v3, v0, v5
- vcmpequb v6, v1, v4
- vcmpequb v7, v1, v5
- vor v8, v2, v3
- vor v9, v6, v7
- vor v11, v8, v9
- vcmpequb. v11, v0, v11
- addi r3, r3, 32
- blt cr6, L(continue)
- /* One (or both) of the quadwords contains a c/null byte. */
- addi r3, r3, -32
-#ifndef USE_AS_STRCHRNUL
- vcmpequb. v11, v0, v9
- blt cr6, L(no_match)
-#endif
- /* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v2, v2, v10)
- VBPERMQ(v3, v3, v10)
- VBPERMQ(v6, v6, v10)
- VBPERMQ(v7, v7, v10)
- /* Shift each component into its correct position for merging. */
-#ifdef __LITTLE_ENDIAN__
- vsldoi v3, v3, v3, 2
- vsldoi v7, v7, v7, 2
-#else
- vsldoi v2, v2, v2, 6
- vsldoi v3, v3, v3, 4
- vsldoi v6, v6, v6, 6
- vsldoi v7, v7, v7, 4
-#endif
-
- /* Merge the results and move to a GPR. */
- vor v1, v3, v2
- vor v2, v6, v7
- vor v4, v1, v2
- MFVRD(r5, v4)
-#ifdef __LITTLE_ENDIAN__
- addi r6, r5, -1
- andc r6, r6, r5
- popcntd r6, r6
-#else
- cntlzd r6, r5 /* Count leading zeros before the match. */
-#endif
- add r3, r3, r6 /* Compute final length. */
- /* Return NULL if null found before c. */
-#ifndef USE_AS_STRCHRNUL
- lbz r4, 0(r3)
- cmpdi cr7, r4, 0
- beq cr7, L(no_match)
-#endif
- blr
-
-#ifndef USE_AS_STRCHRNUL
- .align 4
-L(no_match):
- li r3,0
- blr
-#endif
-
-/* We are here because strchr was called with a null byte. */
- .align 4
-L(null_match):
- /* r0 has a doubleword of null bytes. */
-
- cmpb r5,r12,r0 /* Compare each byte against null bytes. */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r5,r5,r6
- sld r5,r5,r6
-#else
- sld r5,r5,r6
- srd r5,r5,r6
-#endif
-	cmpdi	cr7,r5,0	/* If r5 == 0, no null bytes
-				   have been found.  */
- bne cr7,L(done_null)
-
- mtcrf 0x01,r8
-
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop_null)
-
- /* Handle WORD2 of pair. */
- ldu r12,8(r8)
- cmpb r5,r12,r0
- cmpdi cr7,r5,0
- bne cr7,L(done_null)
- b L(loop_null) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
-
-	/* Main loop to look for the end of the string.  Since it's a
-	   small loop (< 8 instructions), align it to 32 bytes.  */
- .p2align 5
-L(loop_null):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r11,16(r8)
- cmpb r5,r12,r0
- cmpb r10,r11,r0
- or r6,r5,r10
- cmpdi cr7,r6,0
- beq cr7,L(vector1)
-
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
-
- cmpdi cr6,r5,0
- addi r8,r8,-8
- bne cr6,L(done_null)
-
- /* The null byte must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- pointer. */
-
- mr r5,r10
- addi r8,r8,8
-
- /* r5 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-L(done_null):
-#ifdef __LITTLE_ENDIAN__
- addi r0,r5,-1
- andc r0,r0,r5
- popcntd r0,r0
-#else
- cntlzd r0,r5 /* Count leading zeros before the match. */
-#endif
- srdi r0,r0,3 /* Convert leading zeros to bytes. */
- add r3,r8,r0 /* Return address of the matching null byte. */
- blr
- .p2align 5
-L(vector1):
- addi r3, r8, 8
- andi. r10, r3, 31
- bne cr0, L(loop_null)
- vspltisb v8, -1
- vspltisb v0, 0
- vspltisb v10, 3
- lvsl v11, r0, r0
- vslb v10, v11, v10
- li r5, 16
-L(continue1):
- lvx v4, 0, r3
- lvx v5, r3, r5
- vcmpequb v2, v0, v4
- vcmpequb v3, v0, v5
- vor v8, v2, v3
- vcmpequb. v11, v0, v8
- addi r3, r3, 32
- blt cr6, L(continue1)
- addi r3, r3, -32
-L(end1):
- VBPERMQ(v2, v2, v10)
- VBPERMQ(v3, v3, v10)
- /* Shift each component into its correct position for merging. */
-#ifdef __LITTLE_ENDIAN__
- vsldoi v3, v3, v3, 2
-#else
- vsldoi v2, v2, v2, 6
- vsldoi v3, v3, v3, 4
-#endif
-
- /* Merge the results and move to a GPR. */
- vor v4, v3, v2
- MFVRD(r5, v4)
-#ifdef __LITTLE_ENDIAN__
- addi r6, r5, -1
- andc r6, r6, r5
- popcntd r6, r6
-#else
- cntlzd r6, r5 /* Count leading zeros before the match. */
-#endif
- add r3, r3, r6 /* Compute final length. */
- blr
-END (FUNC_NAME)
-
-#ifndef USE_AS_STRCHRNUL
-weak_alias (strchr, index)
-libc_hidden_builtin_def (strchr)
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strchrnul.S b/sysdeps/powerpc/powerpc64/power8/strchrnul.S
deleted file mode 100644
index 3bf4b275dd..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strchrnul.S
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Optimized strchrnul implementation for PowerPC64/POWER8.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_STRCHRNUL 1
-#include <sysdeps/powerpc/powerpc64/power8/strchr.S>
-
-weak_alias (__strchrnul,strchrnul)
-libc_hidden_builtin_def (__strchrnul)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
deleted file mode 100644
index 770484f1e1..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strcmp.S
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Optimized strcmp implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef STRCMP
-# define STRCMP strcmp
-#endif
-
-/* Implements the function
-
-   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
-
-   The implementation uses unaligned doubleword accesses to avoid
-   specialized code paths depending on data alignment.  Although recent
-   powerpc64 kernels default to a 64K page size, the page-cross handling
-   assumes a minimum page size of 4K.  */
-
-EALIGN (STRCMP, 4, 0)
- li r0,0
-
-	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
-	   the code:
-
-	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
-
-	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
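
   rldicl rT,rS,0,52 keeps the low 12 bits of the address, i.e. its
   offset within a 4 KiB page, which is then compared against 4096 - 16.
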
-
- rldicl r7,r3,0,52
- rldicl r9,r4,0,52
- cmpldi cr7,r7,4096-16
- bgt cr7,L(pagecross_check)
- cmpldi cr5,r9,4096-16
- bgt cr5,L(pagecross_check)
-
-	/* For short strings up to 16 bytes, load both s1 and s2 using
-	   unaligned doublewords and compare.  */
- ld r8,0(r3)
- ld r10,0(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- ld r8,8(r3)
- ld r10,8(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- addi r7,r3,16
- addi r4,r4,16
-
-L(align_8b):
-	/* Now that the first 16 bytes have been checked, align source1 to a
-	   doubleword and adjust the source2 address.  */
- rldicl r9,r7,0,61 /* source1 alignment to doubleword */
- subf r4,r9,r4 /* Adjust source2 address based on source1
- alignment. */
- rldicr r7,r7,0,60 /* Align source1 to doubleword. */
-
-	/* At this point, source1 alignment is 0 and source2 alignment is
-	   between 0 and 7.  Check if source2 alignment is also 0, meaning
-	   both sources have the same alignment.  */
- andi. r9,r4,0x7
- bne cr0,L(loop_diff_align)
-
- /* If both source1 and source2 are doubleword aligned, there is no
- need for page boundary cross checks. */
-
- ld r8,0(r7)
- ld r10,0(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- .align 4
-L(loop_equal_align):
- ld r8,8(r7)
- ld r10,8(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- ld r8,16(r7)
- ld r10,16(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- ldu r8,24(r7)
- ldu r10,24(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- b L(loop_equal_align)
-
-	/* A zero byte was found in r8 (the s1 dword); r9 contains the cmpb
-	   result and r10 the dword from s2.  The code isolates the bytes up
-	   to the end (including the '\0') and masks the remaining ones with
-	   0xFF:
-
- #if __LITTLE_ENDIAN__
- (__builtin_ffsl (x) - 1) = counting trailing zero bits
- r9 = (__builtin_ffsl (r9) - 1) + 8;
- r9 = -1UL << r9
- #else
- r9 = __builtin_clzl (r9) + 8;
- r9 = -1UL >> r9
- #endif
- r8 = r8 | r9
- r10 = r10 | r9 */
-
-#ifdef __LITTLE_ENDIAN__
- nor r9,r9,r9
-L(different_nocmpb):
- neg r3,r9
- and r9,r9,r3
- cntlzd r9,r9
- subfic r9,r9,63
-#else
- not r9,r9
-L(different_nocmpb):
- cntlzd r9,r9
- subfic r9,r9,56
-#endif
- srd r3,r8,r9
- srd r10,r10,r9
- rldicl r10,r10,0,56
- rldicl r3,r3,0,56
- subf r3,r10,r3
- extsw r3,r3
- blr
-
- .align 4
-L(pagecross_check):
- subfic r9,r9,4096
- subfic r7,r7,4096
- cmpld cr7,r7,r9
- bge cr7,L(pagecross)
- mr r7,r9
-
-	/* If the unaligned 16-byte read crosses a 4K page boundary, a simple
-	   byte-by-byte comparison is used until the page alignment for s1
-	   is reached.  */
-L(pagecross):
- add r7,r3,r7
- subf r9,r3,r7
- mtctr r9
-
- .align 4
-L(pagecross_loop):
- /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
- and if *s1 is '\0'. */
- lbz r9,0(r3)
- lbz r10,0(r4)
- addi r3,r3,1
- addi r4,r4,1
- cmplw cr7,r9,r10
- cmpdi cr5,r9,r0
- bne cr7,L(pagecross_ne)
- beq cr5,L(pagecross_nullfound)
- bdnz L(pagecross_loop)
- b L(align_8b)
-
- .align 4
-	/* The unaligned read of source2 will cross a 4K page boundary,
-	   and the different byte or NULL may be in the remaining page
-	   bytes.  Since it cannot use the unaligned load, the algorithm
-	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
-L(check_source2_byte):
- li r9,8
- mtctr r9
-
- .align 4
-L(check_source2_byte_loop):
- lbz r9,0(r7)
- lbz r10,0(r4)
- addi r7,r7,1
- addi r4,r4,1
- cmplw cr7,r9,10
- cmpdi r5,r9,0
- bne cr7,L(pagecross_ne)
- beq cr5,L(pagecross_nullfound)
- bdnz L(check_source2_byte_loop)
-
-	/* If source2 is unaligned to a doubleword, the code needs to check
-	   on each iteration if the unaligned doubleword access will cross
-	   a 4K page boundary.  */
- .align 5
-L(loop_unaligned):
- ld r8,0(r7)
- ld r10,0(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
- addi r7,r7,8
- addi r4,r4,8
-
-L(loop_diff_align):
- /* Check if [src2]+8 cross a 4k page boundary:
-
- srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)
-
- with PAGE_SIZE being 4096. */
- rldicl r9,r4,0,52
- cmpldi cr7,r9,4088
- ble cr7,L(loop_unaligned)
- b L(check_source2_byte)
-
- .align 4
-L(pagecross_ne):
- extsw r3,r9
- mr r9,r10
-L(pagecross_retdiff):
- subf r9,r9,r3
- extsw r3,r9
- blr
-
- .align 4
-L(pagecross_nullfound):
- li r3,0
- b L(pagecross_retdiff)
-END (STRCMP)
-libc_hidden_builtin_def (strcmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
deleted file mode 100644
index 7f2cee4b1b..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strcpy.S
+++ /dev/null
@@ -1,270 +0,0 @@
-/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifdef USE_AS_STPCPY
-# ifndef STPCPY
-# define FUNC_NAME __stpcpy
-# else
-# define FUNC_NAME STPCPY
-# endif
-#else
-# ifndef STRCPY
-# define FUNC_NAME strcpy
-# else
-# define FUNC_NAME STRCPY
-# endif
-#endif /* !USE_AS_STPCPY */
-
-/* Implements the function
-
- char * [r3] strcpy (char *dest [r3], const char *src [r4])
-
- or
-
- char * [r3] stpcpy (char *dest [r3], const char *src [r4])
-
- if USE_AS_STPCPY is defined.
-
-   The implementation uses unaligned doubleword accesses to avoid
-   specialized code paths depending on data alignment.  Although recent
-   powerpc64 kernels default to a 64K page size, the page-cross handling
-   assumes a minimum page size of 4K.  */
-
- .machine power7
-EALIGN (FUNC_NAME, 4, 0)
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
-
-	/* Check if [src]+15 will cross a 4K page boundary by checking
-	   whether the bit indicating the page changes.  Basically:
-
- uint64_t srcin = (uint64_t)src;
- uint64_t ob = srcin & 4096UL;
- uint64_t nb = (srcin+15UL) & 4096UL;
- if (ob ^ nb)
- goto pagecross; */
-
- addi r9,r4,15
- xor r9,r9,r4
- rlwinm. r9,r9,0,19,19
- bne L(pagecross)
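
   The rlwinm. isolates bit 0x1000 of the XOR, so the branch is taken
   exactly when adding 15 carries into the 4K page bit; e.g. for
   src = ...0xff8, (0xff8 ^ 0x1007) & 0x1000 is non-zero.
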
-
-	/* For a short string (less than 16 bytes), just calculate its
-	   length as strlen does and issue the copy once a null is found.  */
- mr r7,r4
- ld r12,0(r7) /* Load doubleword from memory. */
- cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */
-	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
- bne cr7,L(done)
-
- ldu r8,8(r7)
- cmpb r10,r8,r0
- cmpdi cr7,r10,0
- bne cr7,L(done)
-
- b L(loop_before)
-
- .align 4
-L(pagecross):
- clrrdi r7,r4,3 /* Align the address to doubleword boundary. */
- rlwinm r6,r4,3,26,28 /* Calculate padding. */
- li r5,-1 /* MASK = 0xffffffffffffffff. */
- ld r12,0(r7) /* Load doubleword from memory. */
-#ifdef __LITTLE_ENDIAN__
- sld r5,r5,r6
-#else
- srd r5,r5,r6 /* MASK = MASK >> padding. */
-#endif
- orc r9,r12,r5 /* Mask bits that are not part of the string. */
- cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
-	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
- bne cr7,L(done)
-
- ldu r6,8(r7)
- cmpb r10,r6,r0
- cmpdi cr7,r10,0
- bne cr7,L(done)
-
- ld r12,0(r7)
- cmpb r10,r12,r0
- cmpdi cr7,r10,0
- bne cr7,L(done)
-
- ldu r6,8(r7)
- cmpb r10,r6,r0
- cmpdi cr7,r10,0
- bne cr7,L(done)
-
- /* We checked for 24 - x bytes, with x being the source alignment
- (0 <= x <= 16), and no zero has been found. Start the loop
- copy with doubleword aligned address. */
- mr r7,r4
- ld r12, 0(r7)
- ldu r8, 8(r7)
-
-L(loop_before):
-	/* Save the two doublewords read from the source and align the source
-	   to 16 bytes for the loop.  */
- mr r11,r3
- std r12,0(r11)
- std r8,8(r11)
- addi r11,r11,16
- rldicl r9,r4,0,60
- subf r7,r9,r7
- subf r11,r9,r11
- b L(loop_start)
-
- .align 5
-L(loop):
- std r12, 0(r11)
- std r6, 8(r11)
- addi r11,r11,16
-L(loop_start):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
-
- ld r12, 8(r7)
- ldu r6, 16(r7)
- cmpb r10,r12,r0
- cmpb r9,r6,r0
- or r8,r9,r10 /* Merge everything in one doubleword. */
- cmpdi cr7,r8,0
- beq cr7,L(loop)
-
-
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
-
- addi r4,r7,-8
- cmpdi cr6,r10,0
- addi r7,r7,-8
- bne cr6,L(done2)
-
- /* The null byte must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- length. */
-
- mr r10,r9
- addi r7,r7,8
- b L(done2)
-
- /* r10 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the null byte in the original
- doubleword from the string. Use that to calculate the length. */
-L(done):
- mr r11,r3
-L(done2):
-#ifdef __LITTLE_ENDIAN__
- addi r9, r10, -1 /* Form a mask from trailing zeros. */
- andc r9, r9, r10
- popcntd r6, r9 /* Count the bits in the mask. */
-#else
- cntlzd r6,r10 /* Count leading zeros before the match. */
-#endif
- subf r5,r4,r7
- srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */
- add r8,r5,r6 /* Compute final length. */
-#ifdef USE_AS_STPCPY
- /* stpcpy returns the dest address plus the size not counting the
- final '\0'. */
- add r3,r11,r8
-#endif
-	addi	r8,r8,1		/* Count the final '\0'.  */
-
- cmpldi cr6,r8,8
- mtocrf 0x01,r8
- ble cr6,L(copy_LE_8)
-
- cmpldi cr1,r8,16
- blt cr1,8f
-
- /* Handle copies of 0~31 bytes. */
- .align 4
-L(copy_LT_32):
- /* At least 6 bytes to go. */
- blt cr1,8f
-
- /* Copy 16 bytes. */
- ld r6,0(r4)
- ld r8,8(r4)
- addi r4,r4,16
- std r6,0(r11)
- std r8,8(r11)
- addi r11,r11,16
-8: /* Copy 8 bytes. */
- bf 28,L(tail4)
- ld r6,0(r4)
- addi r4,r4,8
- std r6,0(r11)
- addi r11,r11,8
-
- .align 4
-/* Copies 4~7 bytes. */
-L(tail4):
- bf 29,L(tail2)
- lwz r6,0(r4)
- stw r6,0(r11)
- bf 30,L(tail5)
- lhz r7,4(r4)
- sth r7,4(r11)
- bflr 31
- lbz r8,6(r4)
- stb r8,6(r11)
- blr
-
- .align 4
-/* Copies 2~3 bytes. */
-L(tail2):
- bf 30,1f
- lhz r6,0(r4)
- sth r6,0(r11)
- bflr 31
- lbz r7,2(r4)
- stb r7,2(r11)
- blr
-
- .align 4
-L(tail5):
- bf 31,1f
- lbz r6,4(r4)
- stb r6,4(r11)
- blr
-
- .align 4
-1:
- bflr 31
- lbz r6,0(r4)
- stb r6,0(r11)
- blr
-
-/* Handles copies of 0~8 bytes. */
- .align 4
-L(copy_LE_8):
- bne cr6,L(tail4)
- ld r6,0(r4)
- std r6,0(r11)
- blr
-END (FUNC_NAME)
-
-#ifndef USE_AS_STPCPY
-libc_hidden_builtin_def (strcpy)
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strcspn.S b/sysdeps/powerpc/powerpc64/power8/strcspn.S
deleted file mode 100644
index c9a7a2e3c3..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strcspn.S
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Optimized strcspn implementation for PowerPC64/POWER8.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_STRCSPN 1
-#include <sysdeps/powerpc/powerpc64/power8/strspn.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
deleted file mode 100644
index 8f4a1fc1dc..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strlen.S
+++ /dev/null
@@ -1,301 +0,0 @@
-/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
- loop.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-
-/* size_t [r3] strlen (char *s [r3]) */
-
-#ifndef STRLEN
-# define STRLEN strlen
-#endif
-
-/* TODO: change this to .machine power8 when the minimum required binutils
- allows it. */
- .machine power7
-EALIGN (STRLEN, 4, 0)
- CALL_MCOUNT 1
- dcbt 0,r3
- clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
- li r5,-1 /* MASK = 0xffffffffffffffff. */
- ld r12,0(r4) /* Load doubleword from memory. */
-#ifdef __LITTLE_ENDIAN__
- sld r5,r5,r6
-#else
- srd r5,r5,r6 /* MASK = MASK >> padding. */
-#endif
- orc r9,r12,r5 /* Mask bits that are not part of the string. */
- cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
-	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
- bne cr7,L(done)
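
   For example, if s starts 2 bytes into its doubleword, the big-endian
   path computes MASK >> 16, and orc then forces those two leading bytes
   of r12 to 0xff so that cmpb cannot mistake them for null bytes.
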
-
- /* For shorter strings (< 64 bytes), we will not use vector registers,
- as the overhead isn't worth it. So, let's use GPRs instead. This
- will be done the same way as we do in the POWER7 implementation.
- Let's see if we are aligned to a quadword boundary. If so, we can
- jump to the first (non-vectorized) loop. Otherwise, we have to
- handle the next DWORD first. */
- mtcrf 0x01,r4
- mr r9,r4
- addi r9,r9,8
- bt 28,L(align64)
-
- /* Handle the next 8 bytes so we are aligned to a quadword
- boundary. */
- ldu r5,8(r4)
- cmpb r10,r5,r0
- cmpdi cr7,r10,0
- addi r9,r9,8
- bne cr7,L(done)
-
-L(align64):
-	/* Proceed to the old (POWER7) implementation, checking two doublewords
-	   per iteration.  For the first 56 bytes, we will just check for null
- characters. After that, we will also check if we are 64-byte aligned
- so we can jump to the vectorized implementation. We will unroll
- these loops to avoid excessive branching. */
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- /* Are we 64-byte aligned? If so, jump to the vectorized loop.
- Note: aligning to 64-byte will necessarily slow down performance for
- strings around 64 bytes in length due to the extra comparisons
- required to check alignment for the vectorized loop. This is a
- necessary tradeoff we are willing to take in order to speed up the
- calculation for larger strings. */
- andi. r10,r9,63
- beq cr0,L(preloop)
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- andi. r10,r9,63
- beq cr0,L(preloop)
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- andi. r10,r9,63
- beq cr0,L(preloop)
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- andi. r10,r9,63
- beq cr0,L(preloop)
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
-
- /* At this point, we are necessarily 64-byte aligned. If no zeroes were
- found, jump to the vectorized loop. */
- beq cr7,L(preloop)
-
-L(dword_zero):
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
-
- cmpdi cr6,r10,0
- addi r4,r4,-8
- bne cr6,L(done)
-
- /* The null byte must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- length. */
-
- mr r10,r11
- addi r4,r4,8
-
- /* If the null byte was found in the non-vectorized code, compute the
- final length. r10 has the output of the cmpb instruction, that is,
- it contains 0xff in the same position as the null byte in the
- original doubleword from the string. Use that to calculate the
- length. */
-L(done):
-#ifdef __LITTLE_ENDIAN__
- addi r9, r10,-1 /* Form a mask from trailing zeros. */
- andc r9, r9,r10
- popcntd r0, r9 /* Count the bits in the mask. */
-#else
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- subf r5,r3,r4
- srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
- add r3,r5,r0 /* Compute final length. */
- blr
-
- /* Vectorized implementation starts here. */
- .p2align 4
-L(preloop):
- /* Set up for the loop. */
- mr r4,r9
- li r7, 16 /* Load required offsets. */
- li r8, 32
- li r9, 48
- li r12, 8
- vxor v0,v0,v0 /* VR with null chars to use with
- vcmpequb. */
-
- /* Main loop to look for the end of the string. We will read in
- 64-byte chunks. Align it to 32 bytes and unroll it 3 times to
- leverage the icache performance. */
- .p2align 5
-L(loop):
- lvx v1,r4,r0 /* Load 4 quadwords. */
- lvx v2,r4,r7
- lvx v3,r4,r8
- lvx v4,r4,r9
- vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
- vminub v6,v3,v4
- vminub v7,v5,v6
- vcmpequb. v7,v7,v0 /* Check for NULLs. */
- addi r4,r4,64 /* Adjust address for the next iteration. */
- bne cr6,L(vmx_zero)
-
- lvx v1,r4,r0 /* Load 4 quadwords. */
- lvx v2,r4,r7
- lvx v3,r4,r8
- lvx v4,r4,r9
- vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
- vminub v6,v3,v4
- vminub v7,v5,v6
- vcmpequb. v7,v7,v0 /* Check for NULLs. */
- addi r4,r4,64 /* Adjust address for the next iteration. */
- bne cr6,L(vmx_zero)
-
- lvx v1,r4,r0 /* Load 4 quadwords. */
- lvx v2,r4,r7
- lvx v3,r4,r8
- lvx v4,r4,r9
- vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
- vminub v6,v3,v4
- vminub v7,v5,v6
- vcmpequb. v7,v7,v0 /* Check for NULLs. */
- addi r4,r4,64 /* Adjust address for the next iteration. */
- beq cr6,L(loop)
-
-L(vmx_zero):
- /* OK, we found a null byte. Let's look for it in the current 64-byte
- block and mark it in its corresponding VR. */
- vcmpequb v1,v1,v0
- vcmpequb v2,v2,v0
- vcmpequb v3,v3,v0
- vcmpequb v4,v4,v0
-
- /* We will now 'compress' the result into a single doubleword, so it
- can be moved to a GPR for the final calculation. First, we
- generate an appropriate mask for vbpermq, so we can permute bits into
- the first halfword. */
- vspltisb v10,3
- lvsl v11,r0,r0
- vslb v10,v11,v10
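
   About this mask: lvsl with a zero effective address yields the bytes
   {0, 1, ..., 15}, and vslb by 3 turns that into {0, 8, ..., 120}, the
   bit indices vbpermq gathers, i.e. the first bit of every byte of the
   comparison result.
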
-
- /* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v1,v1,v10)
- VBPERMQ(v2,v2,v10)
- VBPERMQ(v3,v3,v10)
- VBPERMQ(v4,v4,v10)
-
- /* Shift each component into its correct position for merging. */
-#ifdef __LITTLE_ENDIAN__
- vsldoi v2,v2,v2,2
- vsldoi v3,v3,v3,4
- vsldoi v4,v4,v4,6
-#else
- vsldoi v1,v1,v1,6
- vsldoi v2,v2,v2,4
- vsldoi v3,v3,v3,2
-#endif
-
- /* Merge the results and move to a GPR. */
- vor v1,v2,v1
- vor v2,v3,v4
- vor v4,v1,v2
- MFVRD(r10,v4)
-
-	/* Adjust address to the beginning of the current 64-byte block.  */
- addi r4,r4,-64
-
-#ifdef __LITTLE_ENDIAN__
- addi r9, r10,-1 /* Form a mask from trailing zeros. */
- andc r9, r9,r10
- popcntd r0, r9 /* Count the bits in the mask. */
-#else
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- subf r5,r3,r4
- add r3,r5,r0 /* Compute final length. */
- blr
-
-END (STRLEN)
-libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncase.S b/sysdeps/powerpc/powerpc64/power8/strncase.S
deleted file mode 100644
index 32e09e4d94..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strncase.S
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Optimized strncasecmp implementation for POWER8.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_STRNCASECMP 1
-#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
deleted file mode 100644
index 3d8df90538..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strncmp.S
+++ /dev/null
@@ -1,327 +0,0 @@
-/* Optimized strncmp implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef STRNCMP
-# define STRNCMP strncmp
-#endif
-
-/* Implements the function
-
-   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t n [r5])
-
-   The implementation uses unaligned doubleword accesses to avoid
-   specialized code paths depending on data alignment.  Although recent
-   powerpc64 kernels default to a 64K page size, the page-cross handling
-   assumes a minimum page size of 4K.  */
-
- .machine power7
-EALIGN (STRNCMP, 4, 0)
- /* Check if size is 0. */
- mr. r10,r5
- beq cr0,L(ret0)
-
-	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
-	   the code:
-
-	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
-
-	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
- rldicl r8,r3,0,52
- cmpldi cr7,r8,4096-16
- bgt cr7,L(pagecross)
- rldicl r9,r4,0,52
- cmpldi cr7,r9,4096-16
- bgt cr7,L(pagecross)
-
-	/* For short strings up to 16 bytes, load both s1 and s2 using
-	   unaligned doublewords and compare.  */
- ld r7,0(r3)
- ld r9,0(r4)
- li r8,0
- cmpb r8,r7,r8
- cmpb r6,r7,r9
- orc. r8,r8,r6
- bne cr0,L(different1)
-
-	/* If the strings compared are equal but the size is less than or
-	   equal to 8, return 0.  */
- cmpldi cr7,r10,8
- li r9,0
- ble cr7,L(ret1)
- addi r5,r10,-8
-
- ld r7,8(r3)
- ld r9,8(r4)
- cmpb r8,r7,r8
- cmpb r6,r7,r9
- orc. r8,r8,r6
- bne cr0,L(different0)
-
- cmpldi cr7,r5,8
- mr r9,r8
- ble cr7,L(ret1)
-
- /* Update pointers and size. */
- addi r10,r10,-16
- addi r3,r3,16
- addi r4,r4,16
-
-	/* Now that the first 16 bytes have been checked, align source1 to a
-	   doubleword and adjust the source2 address.  */
-L(align_8b):
- rldicl r5,r3,0,61
- rldicr r3,r3,0,60
- subf r4,r5,r4
- add r10,r10,r5
-
-	/* At this point, source1 alignment is 0 and source2 alignment is
-	   between 0 and 7.  Check if source2 alignment is also 0, meaning
-	   both sources have the same alignment.  */
- andi. r8,r4,0x7
- beq cr0,L(loop_eq_align_0)
-
- li r5,0
- b L(loop_ne_align_1)
-
-	/* If source2 is unaligned to a doubleword, the code needs to check
-	   on each iteration if the unaligned doubleword access will cross
-	   a 4K page boundary.  */
- .align 4
-L(loop_ne_align_0):
- ld r7,0(r3)
- ld r9,0(r4)
- cmpb r8,r7,r5
- cmpb r6,r7,r9
- orc. r8,r8,r6
- bne cr0,L(different1)
-
- cmpldi cr7,r10,8
- ble cr7,L(ret0)
- addi r10,r10,-8
- addi r3,r3,8
- addi r4,r4,8
-L(loop_ne_align_1):
- rldicl r9,r4,0,52
- cmpldi r7,r9,4088
- ble cr7,L(loop_ne_align_0)
- cmpdi cr7,r10,0
- beq cr7,L(ret0)
-
- lbz r9,0(r3)
- lbz r8,0(r4)
- cmplw cr7,r9,r8
- bne cr7,L(byte_ne_4)
- cmpdi cr7,r9,0
- beq cr7,L(size_reached_0)
-
- li r9,r7
- addi r8,r3,1
- mtctr r9
- addi r4,r4,1
- addi r10,r10,-1
- addi r3,r3,8
-
-	/* The unaligned read of source2 will cross a 4K page boundary,
-	   and the different byte or NULL may be in the remaining page
-	   bytes.  Since it cannot use the unaligned load, the algorithm
-	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
- .align 4
-L(loop_ne_align_byte):
- cmpdi cr7,r10,0
- addi r10,r10,-1
- beq cr7,L(ret0)
- lbz r9,0(r8)
- lbz r7,0(r4)
- addi r8,r8,1
- addi r4,r4,1
- cmplw cr7,r9,r7
- cmpdi cr5,r9,0
- bne cr7,L(size_reached_2)
- beq cr5,L(size_reached_0)
- bdnz L(loop_ne_align_byte)
-
- cmpdi cr7,r10,0
- bne+ cr7,L(loop_ne_align_0)
-
- .align 4
-L(ret0):
- li r9,0
-L(ret1):
- mr r3,r9
- blr
-
-	/* The code now checks whether r8 and r10 differ by issuing a
-	   cmpb and shifting the result based on its output:
-
- #ifdef __LITTLE_ENDIAN__
- leadzero = (__builtin_ffsl (z1) - 1);
- leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
- r1 = (r1 >> leadzero) & 0xFFUL;
- r2 = (r2 >> leadzero) & 0xFFUL;
- #else
- leadzero = __builtin_clzl (z1);
- leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
- r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
- r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
- #endif
- return r1 - r2; */
-
- .align 4
-L(different0):
- mr r10,r5
-#ifdef __LITTLE_ENDIAN__
-L(different1):
- neg r11,r8
- sldi r10,r10,3
- and r8,r11,r8
- addi r10,r10,-8
- cntlzd r8,r8
- subfic r8,r8,63
- extsw r8,r8
- cmpld cr7,r8,r10
- ble cr7,L(different2)
- mr r8,r10
-L(different2):
- extsw r8,r8
-#else
-L(different1):
- addi r10,r10,-1
- cntlzd r8,r8
- sldi r10,r10,3
- cmpld cr7,r8,r10
- blt cr7,L(different2)
- mr r8,r10
-L(different2):
- subfic r8,r8,56
-#endif
- srd r7,r7,r8
- srd r9,r9,r8
- rldicl r3,r7,0,56
- rldicl r9,r9,0,56
- subf r9,r9,3
- extsw r9,r9
- mr r3,r9
- blr
-
-	/* If the unaligned 16-byte read crosses a 4K page boundary, a simple
-	   byte-by-byte comparison is used until the page alignment for s1
-	   is reached.  */
- .align 4
-L(pagecross):
- lbz r7,0(r3)
- lbz r9,0(r4)
- subfic r8,r8,4095
- cmplw cr7,r9,r7
- bne cr7,L(byte_ne_3)
- cmpdi cr7,r9,0
- beq cr7,L(byte_ne_0)
- addi r10,r10,-1
- subf r7,r8,r10
- subf r9,r7,r10
- addi r9,r9,1
- mtctr r9
- b L(pagecross_loop1)
-
- .align 4
-L(pagecross_loop0):
- beq cr7,L(ret0)
- lbz r9,0(r3)
- lbz r8,0(r4)
- addi r10,r10,-1
- cmplw cr7,r9,r8
- cmpdi cr5,r9,0
- bne r7,L(byte_ne_2)
- beq r5,L(byte_ne_0)
-L(pagecross_loop1):
- cmpdi cr7,r10,0
- addi r3,r3,1
- addi r4,r4,1
- bdnz L(pagecross_loop0)
- cmpdi cr7,r7,0
- li r9,0
- bne+ cr7,L(align_8b)
- b L(ret1)
-
- /* If both source1 and source2 are doubleword aligned, there is no
- need for page boundary cross checks. */
- .align 4
-L(loop_eq_align_0):
- ld r7,0(r3)
- ld r9,0(r4)
- cmpb r8,r7,r8
- cmpb r6,r7,r9
- orc. r8,r8,r6
- bne cr0,L(different1)
-
- cmpldi cr7,r10,8
- ble cr7,L(ret0)
- addi r9,r10,-9
-
- li r5,0
- srdi r9,r9,3
- addi r9,r9,1
- mtctr r9
- b L(loop_eq_align_2)
-
- .align 4
-L(loop_eq_align_1):
- bdz L(ret0)
-L(loop_eq_align_2):
- ldu r7,8(r3)
- addi r10,r10,-8
- ldu r9,8(r4)
- cmpb r8,r7,r5
- cmpb r6,r7,r9
- orc. r8,r8,r6
- beq cr0,L(loop_eq_align_1)
- b L(different1)
-
- .align 4
-L(byte_ne_0):
- li r7,0
-L(byte_ne_1):
- subf r9,r9,r7
- extsw r9,r9
- b L(ret1)
-
- .align 4
-L(byte_ne_2):
- extsw r7,r9
- mr r9,r8
- b L(byte_ne_1)
-L(size_reached_0):
- li r10,0
-L(size_reached_1):
- subf r9,r9,r10
- extsw r9,r9
- b L(ret1)
-L(size_reached_2):
- extsw r10,r9
- mr r9,r7
- b L(size_reached_1)
-L(byte_ne_3):
- extsw r7,r7
- b L(byte_ne_1)
-L(byte_ne_4):
- extsw r10,r9
- mr r9,r8
- b L(size_reached_1)
-END(STRNCMP)
-libc_hidden_builtin_def(strncmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
deleted file mode 100644
index 6d40f30ff7..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
+++ /dev/null
@@ -1,465 +0,0 @@
-/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifdef USE_AS_STPNCPY
-# ifndef STPNCPY
-# define FUNC_NAME __stpncpy
-# else
-# define FUNC_NAME STPNCPY
-# endif
-#else
-# ifndef STRNCPY
-# define FUNC_NAME strncpy
-# else
-# define FUNC_NAME STRNCPY
-# endif
-#endif /* !USE_AS_STPNCPY */
-
-#ifndef MEMSET
-/* For builds without IFUNC support, local calls should be made to internal
- GLIBC symbol (created by libc_hidden_builtin_def). */
-# ifdef SHARED
-# define MEMSET __GI_memset
-# else
-# define MEMSET memset
-# endif
-#endif
-
-#define FRAMESIZE (FRAME_MIN_SIZE+48)
-
-/* Implements the function
-
- char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
-
- or
-
- char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
-
-   if USE_AS_STPNCPY is defined.
-
-   The implementation uses unaligned doubleword accesses to avoid
-   specialized code paths depending on data alignment.  Although recent
-   powerpc64 kernels default to a 64K page size, the page-cross handling
-   assumes a minimum page size of 4K.  */
-
- .machine power7
-EALIGN (FUNC_NAME, 4, 0)
-
-	/* Check if [src]+15 will cross a 4K page boundary by checking
-	   whether the bit indicating the page changes.  Basically:
-
- uint64_t srcin = (uint64_t)src;
- uint64_t ob = srcin & 4096UL;
- uint64_t nb = (srcin+15UL) & 4096UL;
- if (ob ^ nb)
- goto pagecross; */
-
- addi r10,r4,16
- rlwinm r9,r4,0,19,19
-
- /* Save some non-volatile registers on the stack. */
- std r26,-48(r1)
- std r27,-40(r1)
-
- rlwinm r8,r10,0,19,19
-
- std r28,-32(r1)
- std r29,-24(r1)
-
- cmpld cr7,r9,r8
-
- std r30,-16(r1)
- std r31,-8(r1)
-
- /* Update CFI. */
- cfi_offset(r26, -48)
- cfi_offset(r27, -40)
- cfi_offset(r28, -32)
- cfi_offset(r29, -24)
- cfi_offset(r30, -16)
- cfi_offset(r31, -8)
-
- beq cr7,L(unaligned_lt_16)
- rldicl r9,r4,0,61
- subfic r8,r9,8
- cmpld cr7,r5,r8
- bgt cr7,L(pagecross)
-
-	/* At this point there are 1 to 15 bytes to check and write.  Since
-	   they could come either from the first unaligned 16-byte access or
-	   from the bulk copy, the code uses an unrolled byte read/write
-	   instead of trying to analyze the cmpb results.  */
-L(short_path):
- mr r9,r3
-L(short_path_1):
- /* Return if there are no more bytes to be written. */
- cmpdi cr7,r5,0
- beq cr7,L(short_path_loop_end_1)
-L(short_path_2):
- /* Copy one char from src (r4) and write it to dest (r9). If it is the
- end-of-string, start the null padding. Continue, otherwise. */
- lbz r10,0(r4)
- cmpdi cr7,r10,0
- stb r10,0(r9)
- beq cr7,L(zero_pad_start_1)
- /* If there are no more bytes to be written, return. */
- cmpdi cr0,r5,1
- addi r8,r9,1
- addi r6,r5,-1
- beq cr0,L(short_path_loop_end_0)
- /* Copy another char from src (r4) to dest (r9). Check again if it is
- the end-of-string. If so, start the null padding. */
- lbz r10,1(r4)
- cmpdi cr7,r10,0
- stb r10,1(r9)
- beq cr7,L(zero_pad_start_prepare_1)
- /* Eagerly decrement r5 by 3, which is the number of bytes already
- written, plus one write that will be performed later on. */
- addi r10,r5,-3
- b L(short_path_loop_1)
-
- .align 4
-L(short_path_loop):
- /* At this point, the induction variable, r5, as well as the pointers
-	   to dest and src (r9 and r4, respectively) have been updated.
-
- Note: The registers r7 and r10 are induction variables derived from
- r5. They are used to determine if the total number of writes has
- been reached at every other write.
-
- Copy one char from src (r4) and write it to dest (r9). If it is the
- end-of-string, start the null padding. Continue, otherwise. */
- lbz r8,0(r4)
- addi r7,r10,-2
- cmpdi cr5,r8,0
- stb r8,0(r9)
- beq cr5,L(zero_pad_start_1)
- beq cr7,L(short_path_loop_end_0)
- /* Copy another char from src (r4) to dest (r9). Check again if it is
- the end-of-string. If so, start the null padding. */
- lbz r8,1(r4)
- cmpdi cr7,r8,0
- stb r8,1(r9)
- beq cr7,L(zero_pad_start)
- mr r10,r7
-L(short_path_loop_1):
- /* This block is reached after two chars have been already written to
- dest. Nevertheless, r5 (the induction variable), r9 (the pointer to
- dest), and r4 (the pointer to src) have not yet been updated.
-
- At this point:
- r5 holds the count of bytes yet to be written plus 2.
- r9 points to the last two chars that were already written to dest.
- r4 points to the last two chars that were already copied from src.
-
- The algorithm continues by decrementing r5, the induction variable,
- so that it reflects the last two writes. The pointers to dest (r9)
- and to src (r4) are increment by two, for the same reason.
-
- Note: Register r10 is another induction variable, derived from r5,
- which determines if the total number of writes has been reached. */
- addic. r5,r5,-2
- addi r9,r9,2
- cmpdi cr7,r10,0 /* Eagerly check if the next write is the last. */
- addi r4,r4,2
- addi r6,r9,1
- bne cr0,L(short_path_loop) /* Check if the total number of writes
- has been reached at every other
- write. */
-#ifdef USE_AS_STPNCPY
- mr r3,r9
- b L(short_path_loop_end)
-#endif
-
-L(short_path_loop_end_0):
-#ifdef USE_AS_STPNCPY
- addi r3,r9,1
- b L(short_path_loop_end)
-#endif
-L(short_path_loop_end_1):
-#ifdef USE_AS_STPNCPY
- mr r3,r9
-#endif
-L(short_path_loop_end):
- /* Restore non-volatile registers. */
- ld r26,-48(r1)
- ld r27,-40(r1)
- ld r28,-32(r1)
- ld r29,-24(r1)
- ld r30,-16(r1)
- ld r31,-8(r1)
- blr
-
-	/* This code pads the remainder of dest with null bytes.  The
-	   algorithm calculates the remaining size and calls memset.  */
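
   In effect (a sketch, with `written' standing for the number of bytes
   already copied):

      memset (dest + written, '\0', n - written);
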
- .align 4
-L(zero_pad_start):
- mr r5,r10
- mr r9,r6
-L(zero_pad_start_1):
- /* At this point:
- - r5 holds the number of bytes that still have to be written to
- dest.
- - r9 points to the position, in dest, where the first null byte
- will be written.
- The above statements are true both when control reaches this label
- from a branch or when falling through the previous lines. */
-#ifndef USE_AS_STPNCPY
- mr r30,r3 /* Save the return value of strncpy. */
-#endif
- /* Prepare the call to memset. */
- mr r3,r9 /* Pointer to the area to be zero-filled. */
- li r4,0 /* Byte to be written (zero). */
-
- /* We delayed the creation of the stack frame, as well as the saving of
- the link register, because only at this point are we sure that
- doing so is actually needed. */
-
- /* Save the link register. */
- mflr r0
- std r0,16(r1)
- cfi_offset(lr, 16)
-
- /* Create the stack frame. */
- stdu r1,-FRAMESIZE(r1)
- cfi_adjust_cfa_offset(FRAMESIZE)
-
- bl MEMSET
- nop
-
- /* Restore the stack frame. */
- addi r1,r1,FRAMESIZE
- cfi_adjust_cfa_offset(-FRAMESIZE)
- /* Restore the link register. */
- ld r0,16(r1)
- mtlr r0
-
-#ifndef USE_AS_STPNCPY
- mr r3,r30 /* Restore the return value of strncpy, i.e.,
- dest. For stpncpy, the return value is the
- same as the return value of memset. */
-#endif
-
- /* Restore non-volatile registers and return. */
- ld r26,-48(r1)
- ld r27,-40(r1)
- ld r28,-32(r1)
- ld r29,-24(r1)
- ld r30,-16(r1)
- ld r31,-8(r1)
- blr
-
- /* The common case, where [src]+16 will not cross a 4K page boundary.
- In this case the code quickly checks the first 16 bytes by using
- doubleword reads/compares and updates the destination if neither the
- total size is reached nor a null byte is found. */
- .align 4
-L(unaligned_lt_16):
- cmpldi cr7,r5,7
- ble cr7,L(short_path)
- ld r7,0(r4)
- li r8,0
- cmpb r8,r7,r8
- cmpdi cr7,r8,0
- bne cr7,L(short_path_prepare_2)
- addi r6,r5,-8
- std r7,0(r3)
- addi r9,r3,8
- cmpldi cr7,r6,7
- addi r7,r4,8
- ble cr7,L(short_path_prepare_1_1)
- ld r4,8(r4)
- cmpb r8,r4,r8
- cmpdi cr7,r8,0
- bne cr7,L(short_path_prepare_2_1)
- std r4,8(r3)
- addi r29,r3,16
- addi r5,r5,-16
- /* Neither a null byte was found nor the total length was reached;
- align to 16 bytes and issue a bulk copy/compare. */
- b L(align_to_16b)
-
- /* In the case of a 4K page boundary cross, the algorithm first aligns
- the address to a doubleword, calculates a mask based on the alignment
- to ignore the bytes outside the string, and continues using
- doublewords. */
- .align 4
-L(pagecross):
- rldicr r11,r4,0,59 /* Align the address to an 8-byte boundary. */
- li r6,-1 /* MASK = 0xffffffffffffffffUL. */
- sldi r9,r9,3 /* Calculate padding. */
- ld r7,0(r11) /* Load doubleword from memory. */
-#ifdef __LITTLE_ENDIAN__
- sld r9,r6,r9 /* MASK = MASK << padding. */
-#else
- srd r9,r6,r9 /* MASK = MASK >> padding. */
-#endif
- orc r9,r7,r9 /* Mask bits that are not part of the
- string. */
- li r7,0
- cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
- cmpdi cr7,r9,0
- bne cr7,L(short_path_prepare_2)
- subf r8,r8,r5 /* Adjust total length. */
- cmpldi cr7,r8,8 /* Check if length was reached. */
- ble cr7,L(short_path_prepare_2)
-
- /* For the next checks the address is aligned, so check three more
- doublewords to make sure we can read 16 unaligned bytes and start
- the bulk copy with 16-byte aligned addresses. */
- ld r7,8(r11)
- cmpb r9,r7,r9
- cmpdi cr7,r9,0
- bne cr7,L(short_path_prepare_2)
- addi r7,r8,-8
- cmpldi cr7,r7,8
- ble cr7,L(short_path_prepare_2)
- ld r7,16(r11)
- cmpb r9,r7,r9
- cmpdi cr7,r9,0
- bne cr7,L(short_path_prepare_2)
- addi r8,r8,-16
- cmpldi cr7,r8,8
- ble cr7,L(short_path_prepare_2)
- ld r8,24(r11)
- cmpb r9,r8,r9
- cmpdi cr7,r9,0
- bne cr7,L(short_path_prepare_2)
-
- /* No null byte was found in the 32 bytes read and the total length
- was not reached; read the source again using unaligned loads and
- store the bytes. */
- ld r9,0(r4)
- addi r29,r3,16
- addi r5,r5,-16
- std r9,0(r3)
- ld r9,8(r4)
- std r9,8(r3)
-
- /* Align the source to 16 bytes and adjust the destination and size. */
-L(align_to_16b):
- rldicl r9,r10,0,60
- rldicr r28,r10,0,59
- add r12,r5,r9
- subf r29,r9,r29
-
- /* The bulk read/compare/copy loads two doublewords, compare and merge
- in a single register for speed. This is an attempt to speed up the
- null-checking process for bigger strings. */
-
- cmpldi cr7,r12,15
- ble cr7,L(short_path_prepare_1_2)
-
- /* Main loop for large sizes, unrolled 2 times to get better use of
- pipeline. */
- ld r8,0(r28)
- ld r10,8(r28)
- li r9,0
- cmpb r7,r8,r9
- cmpb r9,r10,r9
- or. r6,r9,r7
- bne cr0,L(short_path_prepare_2_3)
- addi r5,r12,-16
- addi r4,r28,16
- std r8,0(r29)
- std r10,8(r29)
- cmpldi cr7,r5,15
- addi r9,r29,16
- ble cr7,L(short_path_1)
- mr r11,r28
- mr r6,r29
- li r30,0
- subfic r26,r4,48
- subfic r27,r9,48
-
- b L(loop_16b)
-
- .align 4
-L(loop_start):
- ld r31,0(r11)
- ld r10,8(r11)
- cmpb r0,r31,r7
- cmpb r8,r10,r7
- or. r7,r0,r8
- addi r5,r5,-32
- cmpldi cr7,r5,15
- add r4,r4,r26
- add r9,r9,r27
- bne cr0,L(short_path_prepare_2_2)
- add r4,r28,r4
- std r31,0(r6)
- add r9,r29,r9
- std r10,8(r6)
- ble cr7,L(short_path_1)
-
-L(loop_16b):
- ld r10,16(r11)
- ld r0,24(r11)
- cmpb r8,r10,r30
- cmpb r7,r0,r30
- or. r7,r8,r7
- addi r12,r12,-32
- cmpldi cr7,r12,15
- addi r11,r11,32
- bne cr0,L(short_path_2)
- std r10,16(r6)
- addi r6,r6,32
- std r0,-8(r6)
- bgt cr7,L(loop_start)
-
- mr r5,r12
- mr r4,r11
- mr r9,r6
- b L(short_path_1)
-
- .align 4
-L(short_path_prepare_1_1):
- mr r5,r6
- mr r4,r7
- b L(short_path_1)
-L(short_path_prepare_1_2):
- mr r5,r12
- mr r4,r28
- mr r9,r29
- b L(short_path_1)
-L(short_path_prepare_2):
- mr r9,r3
- b L(short_path_2)
-L(short_path_prepare_2_1):
- mr r5,r6
- mr r4,r7
- b L(short_path_2)
-L(short_path_prepare_2_2):
- mr r5,r12
- mr r4,r11
- mr r9,r6
- b L(short_path_2)
-L(short_path_prepare_2_3):
- mr r5,r12
- mr r4,r28
- mr r9,r29
- b L(short_path_2)
-L(zero_pad_start_prepare_1):
- mr r5,r6
- mr r9,r8
- b L(zero_pad_start_1)
-END (FUNC_NAME)
-
-#ifndef USE_AS_STPNCPY
-libc_hidden_builtin_def (strncpy)
-#endif
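For reference, the contract this file implemented can be sketched in C (a minimal
sketch; ref_strncpy is a hypothetical name, and the real entry points are selected
through the FUNC_NAME and USE_AS_STPNCPY macros above):

    #include <stddef.h>
    #include <string.h>

    /* Copy at most n bytes of src into dest, then pad the remainder of
       dest with null bytes (the zero_pad path above).  strncpy returns
       dest; stpncpy instead returns a pointer to the first null written,
       or dest + n if src is longer than n.  */
    static char *
    ref_strncpy (char *dest, const char *src, size_t n)
    {
      size_t i = 0;
      while (i < n && src[i] != '\0')
        {
          dest[i] = src[i];
          i++;
        }
      if (i < n)
        memset (dest + i, '\0', n - i);
      return dest;                 /* stpncpy would return dest + i.  */
    }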
diff --git a/sysdeps/powerpc/powerpc64/power8/strnlen.S b/sysdeps/powerpc/powerpc64/power8/strnlen.S
deleted file mode 100644
index 3eadbfb09e..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strnlen.S
+++ /dev/null
@@ -1,433 +0,0 @@
-/* Optimized strnlen implementation for POWER8 using a vmx loop.
-
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* The following heuristic is implemented:
-   1. Case maxlen <= 32: align the pointer to 8 bytes to loop through
-      reading doublewords. Uses the POWER7 algorithm.
-   2. Case maxlen > 32: check for null bytes in the first 16 bytes using
-      unaligned accesses. Return the length if found. Otherwise:
-      2.1 Case maxlen < 64: deduct the bytes previously read, align
-          the pointer to 16 bytes and loop through reading quadwords
-          until null bytes are found or maxlen is reached.
-      2.2 Case maxlen >= 64: deduct the bytes previously read, align
-          the pointer to 64 bytes and set up a counter to loop through
-          reading in strides of 64 bytes. If the loop finishes without
-          finding null bytes, process the remaining bytes by switching
-          to the 16-byte loop of heuristic 2.1. */
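A C-level model of the dispatch described above (a sketch; model_strnlen and
scan are hypothetical names standing in for the labeled blocks in this file):

    #include <stddef.h>

    /* Plain byte loop standing in for each optimized block.  */
    static size_t
    scan (const char *s, size_t n)
    {
      size_t i = 0;
      while (i < n && s[i] != '\0')
        i++;
      return i;
    }

    /* Shape of the heuristic: thresholds and the 16-byte peek only.  */
    static size_t
    model_strnlen (const char *s, size_t maxlen)
    {
      if (maxlen <= 32)
        return scan (s, maxlen);          /* L(small_range): 8B loop.  */
      size_t head = scan (s, 16);         /* Unaligned peek at 16 bytes.  */
      if (head < 16)
        return head;                      /* L(early_find).  */
      /* maxlen < 64 continues in the 16B quadword loop, otherwise in
         64B strides; both reduce to scanning the remainder here.  */
      return 16 + scan (s + 16, maxlen - 16);
    }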
-
-#include <sysdep.h>
-
-/* Define default page size to 4KB. */
-#define PAGE_SIZE 4096
-
-/* The following macros implement Power ISA v2.07 opcodes
- that cannot be used directly in this code, in order to keep
- compatibility with older binutils versions. */
-
-/* Move from vector register doubleword. */
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-
-/* Move to vector register doubleword. */
-#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-
-/* Vector Bit Permute Quadword. */
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-
-/* Vector Population Count Halfword. */
-#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21)))
-
-/* Vector Count Leading Zeros Halfword. */
-#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21)))
-
-
-/* int [r3] strnlen (char *s [r3], size_t maxlen [r4]) */
-/* TODO: change to power8 when minimum required binutils allows it. */
- .machine power7
-ENTRY (__strnlen)
- CALL_MCOUNT 2
- dcbt 0,r3
-
- cmpldi r4,32 /* Check if maxlen <= 32. */
- ble L(small_range) /* If maxlen <= 32. */
-
- /* The upcoming 16-byte unaligned accesses must not cross the page
- boundary, otherwise the processor raises a memory access error.
- Use the following check to verify there is room for such accesses:
- ((size_t) s % PAGE_SIZE) > (PAGE_SIZE - 16)
- If they are disallowed, switch to the code that handles
- the string when maxlen <= 32. */
- clrldi r10,r3,52
- cmpldi cr7,r10,PAGE_SIZE-16
- bgt cr7,L(small_range) /* If less than 16B of page end. */
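The guard above corresponds to this C predicate (a sketch; near_page_end is a
hypothetical name, and 4096 is the PAGE_SIZE defined earlier in this file):

    #include <stdint.h>

    /* True when fewer than 16 bytes remain before the next 4 KiB page
       boundary, i.e. a 16-byte unaligned load at s could fault.  */
    static int
    near_page_end (const char *s)
    {
      return ((uintptr_t) s % 4096) > (4096 - 16);
    }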
-
- /* Compute our permute constant r8. */
- li r7,0
- /* Compute a bpermd constant to move bit 0 of each word into
- a halfword value, and count trailing zeros. */
-#ifdef __LITTLE_ENDIAN__
- li r8,0x2820
- oris r8,r8,0x3830
- sldi r8,r8,32
- ori r8,r8,0x0800
- oris r8,r8,0x1810
-#else
- li r8,0x1018
- oris r8,r8,0x0008
- sldi r8,r8,32
- ori r8,r8,0x3038
- oris r8,r8,0x2028
-#endif
-
- /* maxlen > 32. Optimistically check for null bytes in the first
- 16 bytes of the string using unaligned accesses. */
- ld r5,0(r3)
- ld r6,8(r3)
- cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */
- cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */
- or. r7,r10,r11
- bne cr0, L(early_find) /* If found null bytes. */
-
- /* At this point maxlen > 32 and null bytes were not found in the
- first 16 bytes. Prepare for the loop using VMX. */
-
- /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */
-
- addi r5,r3,16 /* Align up, or just add the 16B we
- already checked. */
- li r0,15
- and r7,r5,r0 /* Find offset into 16B alignment. */
- andc r5,r5,r0 /* Quadword align up s to the next quadword. */
- li r0,16
- subf r0,r7,r0
- subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */
-
-
- /* Compute offsets for vmx loads, and precompute the vbpermq
- constants for both the 64B and 16B loops. */
- li r6,0
- vspltisb v0,0
- vspltisb v10,3
- lvsl v11,r6,r6
- vslb v10,v11,v10
-
- cmpldi r4,64 /* Check maxlen < 64. */
- blt L(smaller) /* If maxlen < 64 */
-
- /* In order to begin the 64B loop, the pointer needs to be 64-byte
- aligned, so read quadwords until it is aligned or null bytes are
- found. In the worst case it becomes aligned after the fourth
- iteration, so unroll the loop to avoid checking a counter. */
- andi. r7,r5,63 /* Check if is 64 bytes aligned. */
- beq cr0,L(preloop_64B) /* If it is already 64B aligned. */
- lvx v1,r5,r6
- vcmpequb. v1,v1,v0
- addi r5,r5,16
- addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */
- bne cr6,L(found_aligning64B) /* If found null bytes. */
-
- /* Unroll the block above 3x until aligned or null bytes are found. */
- andi. r7,r5,63
- beq cr0,L(preloop_64B)
- lvx v1,r5,r6
- vcmpequb. v1,v1,v0
- addi r5,r5,16
- addi r4,r4,-16
- bne cr6,L(found_aligning64B)
-
- andi. r7,r5,63
- beq cr0,L(preloop_64B)
- lvx v1,r5,r6
- vcmpequb. v1,v1,v0
- addi r5,r5,16
- addi r4,r4,-16
- bne cr6,L(found_aligning64B)
-
- andi. r7,r5,63
- beq cr0,L(preloop_64B)
- lvx v1,r5,r6
- vcmpequb. v1,v1,v0
- addi r5,r5,16
- addi r4,r4,-16
- bne cr6,L(found_aligning64B)
-
- /* At this point the pointer is 64-byte aligned.
- Prepare for the 64B loop. */
- .p2align 4
-L(preloop_64B):
- /* Check if maxlen has become less than 64, which disallows the
- 64B loop. If so, switch to the 16B loop code. */
- cmpldi r4,64 /* Check if maxlen < 64. */
- blt L(smaller) /* If maxlen < 64. */
- /* Set some constant values. */
- li r7,16
- li r10,32
- li r9,48
-
- /* Compute the number of 64 bytes iterations needed. */
- srdi r11,r4,6 /* Compute loop count (maxlen / 64). */
- andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */
- mtctr r11 /* Move loop count to counter register. */
-
- /* Handle maxlen > 64. Loop over the bytes in strides of 64B. */
- .p2align 4
-L(loop_64B):
- lvx v1,r5,r6 /* r5 is the pointer to s. */
- lvx v2,r5,r7
- lvx v3,r5,r10
- lvx v4,r5,r9
- /* Merge the four 16B vectors with unsigned byte minimums so that any
- null byte propagates into v7, then check v7 for null bytes (see the
- scalar sketch after this loop). */
- vminub v5,v1,v2
- vminub v6,v3,v4
- vminub v7,v5,v6
- vcmpequb. v7,v7,v0 /* Check for null bytes. */
- addi r5,r5,64 /* Advance the pointer to the next iteration. */
- bne cr6,L(found_64B) /* If found null bytes. */
- bdnz L(loop_64B) /* Continue the loop if count > 0. */
-
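The vminub reduction works because the byte-wise minimum of two vectors is zero
in a lane exactly when one of them is; a scalar sketch of the idea
(block64_has_zero is a hypothetical name):

    /* Fold four 16-byte blocks with byte minimums; a zero survives into
       the folded lane iff some block has a zero byte there, so a single
       comparison against zero covers all 64 bytes.  */
    static int
    block64_has_zero (const unsigned char *p)  /* p points to 64 bytes.  */
    {
      for (int i = 0; i < 16; i++)
        {
          unsigned char m01 = p[i] < p[i + 16] ? p[i] : p[i + 16];
          unsigned char m23 = p[i + 32] < p[i + 48] ? p[i + 32] : p[i + 48];
          if ((m01 < m23 ? m01 : m23) == 0)
            return 1;
        }
      return 0;
    }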
-/* Hit the loop end without a null match, so fall through to handle the
-   remainder. */
-
- /* Prepare a 16B loop to handle two cases:
- 1. If 32 < maxlen < 64.
- 2. If maxlen >= 64, and the end of the 64B loop was reached without
- finding null bytes. Handle the remainder bytes here. */
- .p2align 4
-L(smaller):
- cmpldi r4,0 /* Check if maxlen is zero. */
- beq L(done) /* If maxlen is zero. */
-
- /* Place the rounded-up number of quadwords to check into a VMX
- register, and use some vector tricks to minimize
- branching. */
- MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */
- vspltisb v5,1
- vspltisb v6,15
- vspltb v2,v7,7
- vaddubs v3,v5,v6
-
-#ifdef __LITTLE_ENDIAN__
- vspltish v5,1 /* Compute 16 in each byte. */
-#endif
-
- /* Loop in 16B aligned increments now. */
- .p2align 4
-L(loop_16B):
- lvx v1,r5,r6 /* Load quadword into vector register. */
- addi r5,r5,16 /* Increment address to next 16B block. */
- vor v7,v2,v2 /* Save loop count (v2) into v7. */
- vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */
- vminub v4,v1,v2
- vcmpequb. v4,v4,v0 /* Checking for null bytes. */
- beq cr6,L(loop_16B) /* If null bytes not found. */
-
- vcmpequb v1,v1,v0
- VBPERMQ(v1,v1,v10)
-#ifdef __LITTLE_ENDIAN__
- vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */
- vandc v2,v2,v1
- VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */
-#else
- VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */
-#endif
- /* Truncate to maximum allowable offset. */
- vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond
- maxlen. */
- vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */
-
- MFVRD(r0,v1)
- addi r5,r5,-16 /* Undo speculative bump. */
- extsb r0,r0 /* Clear whatever gunk is in the high 56b. */
- add r5,r5,r0 /* Add the offset of whatever was found. */
-L(done):
- subf r3,r3,r5 /* The length equals the offset of the matched
- null byte minus the pointer to s. */
- blr /* Done. */
-
- /* Handle the case of maxlen > 64 where null bytes were found in the
- last block of 64 bytes read. */
- .p2align 4
-L(found_64B):
- /* A zero was found. Reduce the result. */
- vcmpequb v1,v1,v0
- vcmpequb v2,v2,v0
- vcmpequb v3,v3,v0
- vcmpequb v4,v4,v0
-
- /* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v1,v1,v10)
- VBPERMQ(v2,v2,v10)
- VBPERMQ(v3,v3,v10)
- VBPERMQ(v4,v4,v10)
-
- /* Shift each component into its correct position for merging. */
-#ifdef __LITTLE_ENDIAN__
- vsldoi v2,v2,v2,2
- vsldoi v3,v3,v3,4
- vsldoi v4,v4,v4,6
-#else
- vsldoi v1,v1,v1,6
- vsldoi v2,v2,v2,4
- vsldoi v3,v3,v3,2
-#endif
-
- /* Merge the results and move to a GPR. */
- vor v1,v2,v1
- vor v2,v3,v4
- vor v4,v1,v2
-
- /* Adjust address to the start of the current 64B block. */
- addi r5,r5,-64
-
- MFVRD(r10,v4)
-#ifdef __LITTLE_ENDIAN__
- addi r9,r10,-1 /* Form a mask from trailing zeros. */
- andc r9,r9,r10
- popcntd r0,r9 /* Count the bits in the mask. */
-#else
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- subf r5,r3,r5
- add r3,r5,r0 /* Compute final length. */
- blr /* Done. */
-
- /* Handle the case where null bytes were found while aligning in
- preparation for the 64B loop. */
- .p2align 4
-L(found_aligning64B):
- VBPERMQ(v1,v1,v10)
-#ifdef __LITTLE_ENDIAN__
- MFVRD(r10,v1)
- addi r9,r10,-1 /* Form a mask from trailing zeros. */
- andc r9,r9,r10
- popcntd r0,r9 /* Count the bits in the mask. */
-#else
- vsldoi v1,v1,v1,6
- MFVRD(r10,v1)
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- addi r5,r5,-16 /* Adjust the address to the offset of the
- last 16 bytes read. */
- /* Calculate the length as the offset of the last 16 bytes read minus
- the pointer to s, plus the bytes before the match. */
- subf r5,r3,r5
- add r3,r5,r0
- blr /* Done. */
-
- /* Handle the case of maxlen > 32 where a null byte was found within
- the first 16 bytes of s. */
- .p2align 4
-L(early_find):
- bpermd r5,r8,r10 /* r8 contains the bit permute constants. */
- bpermd r6,r8,r11
- sldi r5,r5,8
- or r5,r5,r6 /* r5 should hold a 16B mask of
- a potential 0. */
- cntlzd r5,r5 /* Count leading zeros. */
- addi r3,r5,-48 /* Deduct the 48 leading zeros always
- present. */
- blr /* Done. */
-
- /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */
- .p2align 4
-L(small_range):
- clrrdi r8,r3,3 /* Align the pointer to 8B. */
- li r0,0
- /* Registers' contents at this point:
- r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B,
- r7 == last acceptable address. */
- cmpldi r4,0 /* Check if maxlen is zero. */
- beq L(end_max) /* If maxlen is zero. */
-
- /* Calculate the last acceptable address and check for possible
- addition overflow by using saturated math:
- r7 = r3 + r4
- r7 |= -(r7 < r3) */
- add r7,r3,r4
- subfc r6,r3,r7
- subfe r9,r9,r9
- extsw r6,r9
- or r7,r7,r6
- addi r7,r7,-1
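In C, that saturating end-pointer computation looks roughly like this
(a sketch; last_acceptable is a hypothetical name):

    #include <stdint.h>

    /* Last acceptable address to read: s + maxlen - 1, with the sum
       clamped to all-ones if it wraps past the end of the address
       space, so the loop bound can never be spuriously small.  */
    static uintptr_t
    last_acceptable (uintptr_t s, uintptr_t maxlen)
    {
      uintptr_t end = s + maxlen;
      end |= -(uintptr_t) (end < s);  /* All-ones if the addition wrapped.  */
      return end - 1;
    }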
-
- clrrdi r7,r7,3 /* Align the last acceptable address
- to 8B. */
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- ld r12,0(r8) /* Load aligned doubleword. */
- cmpb r10,r12,r0 /* Check for null bytes. */
-#ifdef __LITTLE_ENDIAN__
- srd r10,r10,r6
- sld r10,r10,r6
-#else
- sld r10,r10,r6
- srd r10,r10,r6
-#endif /* __LITTLE_ENDIAN__ */
- cmpldi cr7,r10,0
- bne cr7,L(done_small) /* If found null byte. */
-
- cmpld r8,r7 /* Check if reached maxlen. */
- beq L(end_max) /* If reached maxlen. */
-
- /* Still handling the case of maxlen <= 32. Read aligned doublewords
- until null bytes are found or maxlen is reached. */
- .p2align 4
-L(loop_small):
- ldu r12,8(r8) /* Load next doubleword and update r8. */
- cmpb r10,r12,r0 /* Check for null bytes. */
- cmpldi cr6,r10,0
- bne cr6,L(done_small) /* If found null bytes. */
- cmpld r8,r7 /* Check if reached maxlen. */
- bne L(loop_small) /* If it has more bytes to read. */
- mr r3,r4 /* Reached maxlen with null bytes not found.
- Length is equal to maxlen. */
- blr /* Done. */
-
- /* Still handling case of maxlen <= 32. Found null bytes.
- Registers: r10 == match bits within doubleword, r8 == address of
- last doubleword read, r3 == pointer to s, r4 == maxlen. */
- .p2align 4
-L(done_small):
-#ifdef __LITTLE_ENDIAN__
- /* Count trailing zeros. */
- addi r0,r10,-1
- andc r0,r0,r10
- popcntd r0,r0
-#else
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- sub r3,r8,r3 /* Calculate total of bytes before the match. */
- srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
- add r3,r3,r0 /* Length until the match. */
- cmpld r3,r4 /* Check length is greater than maxlen. */
- blelr
- mr r3,r4 /* If length is greater than maxlen, return
- maxlen. */
- blr
-
- /* Handle the case of reaching maxlen without finding null bytes. */
- .p2align 4
-L(end_max):
- mr r3,r4 /* Length is equal to maxlen. */
- blr /* Done. */
-
-
-END (__strnlen)
-libc_hidden_def (__strnlen)
-weak_alias (__strnlen, strnlen)
-libc_hidden_def (strnlen)
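The endian-dependent bookkeeping used throughout these files reduces a cmpb
style mask (0xff in every matching byte lane) to a byte index; a C sketch
using GCC builtins (first_match_byte is a hypothetical name; m must be
nonzero):

    #include <stdint.h>

    /* Byte index of the first match in a doubleword.  Little endian
       counts trailing zeros via the (m - 1) & ~m idiom and popcount;
       big endian counts leading zeros, as in the #ifdef blocks above.  */
    static unsigned
    first_match_byte (uint64_t m, int little_endian)
    {
      if (little_endian)
        return (unsigned) __builtin_popcountll ((m - 1) & ~m) / 8;
      return (unsigned) __builtin_clzll (m) / 8;
    }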
diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
deleted file mode 100644
index 8eb74853c3..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strrchr.S
+++ /dev/null
@@ -1,464 +0,0 @@
-/* Optimized strrchr implementation for PowerPC64/POWER8 using cmpb and
-   vector insns.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* char *[r3] strrchr (char *s [r3], int c [r4]) */
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
-#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
-#define VADDUQM(t,a,b) .long (0x10000100 \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-#ifdef __LITTLE_ENDIAN__
-/* Find the match position from v6 and place result in r6. */
-# define CALCULATE_MATCH() \
- VBPERMQ(v6, v6, v10); \
- vsldoi v6, v6, v6, 6; \
- MFVRD(r7, v6); \
- cntlzd r6, r7; \
- subfic r6, r6, 15;
-/*
- * Find the first null position to mask bytes after null.
- * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
- * Result placed at v2.
- */
-# define FIND_NULL_POS(reg) \
- vspltisb v11, -1; \
- VADDUQM(v11, reg, v11); \
- vandc v11, v11, reg; \
- VPOPCNTD(v2, v11); \
- vspltb v11, v2, 15; \
- vcmpequb. v11, v11, v9; \
- blt cr6, 1f; \
- vsldoi v9, v0, v9, 1; \
- vslo v2, v2, v9; \
-1: \
- vsumsws v2, v2, v0;
-#else
-# define CALCULATE_MATCH() \
- VBPERMQ(v6, v6, v10); \
- MFVRD(r7, v6); \
- addi r6, r7, -1; \
- andc r6, r6, r7; \
- popcntd r6, r6; \
- subfic r6, r6, 15;
-# define FIND_NULL_POS(reg) \
- VCLZD(v2, reg); \
- vspltb v11, v2, 7; \
- vcmpequb. v11, v11, v9; \
- blt cr6, 1f; \
- vsldoi v9, v0, v9, 1; \
- vsro v2, v2, v9; \
-1: \
- vsumsws v2, v2, v0;
-#endif /* !__LITTLE_ENDIAN__ */
- .machine power7
-ENTRY (strrchr)
- CALL_MCOUNT 2
- dcbt 0,r3
- clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
- cmpdi cr7,r4,0
- ld r12,0(r8) /* Load doubleword from memory. */
- li r9,0 /* Used to store the last occurrence. */
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
-
- beq cr7,L(null_match)
-
- /* Replicate byte to doubleword. */
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32
- insrdi r4,r4,32,0
-
- /* r4 is changed now. If the low byte of c is zero, the replicated
- doubleword is also zero, so check for null again. */
- cmpdi cr7,r4,0
- beq cr7,L(null_match)
- /* Now r4 has a doubleword of c bytes and r0 has
- a doubleword of null bytes. */
-
- cmpb r10,r12,r4 /* Compare each byte against c byte. */
- cmpb r11,r12,r0 /* Compare each byte against null byte. */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r10,r10,r6
- srd r11,r11,r6
- sld r10,r10,r6
- sld r11,r11,r6
-#else
- sld r10,r10,r6
- sld r11,r11,r6
- srd r10,r10,r6
- srd r11,r11,r6
-#endif
- or r5,r10,r11 /* OR the results to speed things up. */
- cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
- have been found. */
- bne cr7,L(done)
-
-L(align):
- andi. r12, r8, 15
-
- /* Is r8 positioned so that the main loop will read 16B-aligned
- doubleword pairs (r8 % 16 == 8)? If so, skip to the main loop.
- Otherwise, handle the second doubleword of the pair first. */
-
- bne cr0, L(loop)
-
- /* Handle WORD2 of pair. */
- ldu r12,8(r8)
- cmpb r10,r12,r4
- cmpb r11,r12,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- bne cr7,L(done)
- b L(loop) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
- .p2align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r7,16(r8)
- cmpb r10,r12,r4
- cmpb r11,r12,r0
- cmpb r6,r7,r4
- cmpb r7,r7,r0
- or r12,r10,r11
- or r5,r6,r7
- or r5,r12,r5
- cmpdi cr7,r5,0
- beq cr7,L(vector)
-
- /* OK, one (or both) of the doublewords contains a c/null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a c/null byte. */
- cmpdi cr6,r12,0
- addi r8,r8,-8
- bne cr6,L(done)
-
- /* The c/null byte must be in the second doubleword. Adjust the
- address again and move the result of cmpb to r10 so we can calculate
- the pointer. */
-
- mr r10,r6
- mr r11,r7
- addi r8,r8,8
-
- /* r10/r11 have the output of the cmpb instructions, that is,
- 0xff in the same position as the c/null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-
-L(done):
- /* If there are more than one 0xff in r11, find the first position of
- 0xff in r11 and fill r10 with 0 from that position. */
- cmpdi cr7,r11,0
- beq cr7,L(no_null)
-#ifdef __LITTLE_ENDIAN__
- addi r3,r11,-1
- andc r3,r3,r11
- popcntd r0,r3
-#else
- cntlzd r0,r11
-#endif
- subfic r0,r0,63
- li r6,-1
-#ifdef __LITTLE_ENDIAN__
- srd r0,r6,r0
-#else
- sld r0,r6,r0
-#endif
- and r10,r0,r10
-L(no_null):
-#ifdef __LITTLE_ENDIAN__
- cntlzd r0,r10 /* Count leading zeros before c matches. */
- addi r3,r10,-1
- andc r3,r3,r10
- addi r10,r11,-1
- andc r10,r10,r11
- cmpld cr7,r3,r10
- bgt cr7,L(no_match)
-#else
- addi r3,r10,-1 /* Count trailing zeros before c matches. */
- andc r3,r3,r10
- popcntd r0,r3
- cmpld cr7,r11,r10
- bgt cr7,L(no_match)
-#endif
- srdi r0,r0,3 /* Convert trailing zeros to bytes. */
- subfic r0,r0,7
- add r9,r8,r0 /* Return address of the matching c byte
- or null in case c was not found. */
- li r0,0
- cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
- beq cr7,L(align)
-
- .align 4
-L(no_match):
- mr r3,r9
- blr
-
-/* Check the first 32B in GPRs and move to the vectorized loop. */
- .p2align 5
-L(vector):
- addi r3, r8, 8
- /* Make sure 32B aligned. */
- andi. r10, r3, 31
- bne cr0, L(loop)
- vspltisb v0, 0
- /* Precompute vbpermq constant. */
- vspltisb v10, 3
- lvsl v11, r0, r0
- vslb v10, v11, v10
- MTVRD(v1, r4)
- li r5, 16
- vspltb v1, v1, 7
- /* Compare 32 bytes in each loop. */
-L(continue):
- lvx v4, 0, r3
- lvx v5, r3, r5
- vcmpequb v2, v0, v4
- vcmpequb v3, v0, v5
- vcmpequb v6, v1, v4
- vcmpequb v7, v1, v5
- vor v8, v2, v3
- vor v9, v6, v7
- vor v11, v8, v9
- vcmpequb. v11, v0, v11
- addi r3, r3, 32
- blt cr6, L(continue)
- vcmpequb. v8, v0, v8
- blt cr6, L(match)
-
- /* One (or both) of the quadwords contains c/null. */
- vspltisb v8, 2
- vspltisb v9, 5
- /* Precompute values used for comparison. */
- vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
- vaddubm v8, v9, v9
- vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
-
- /* Check if null is in second qw. */
- vcmpequb. v11, v0, v2
- blt cr6, L(secondqw)
-
- /* Null found in first qw. */
- addi r8, r3, -32
- /* Calculate the null position. */
- FIND_NULL_POS(v2)
- /* Check if null is in the first byte. */
- vcmpequb. v11, v0, v2
- blt cr6, L(no_match)
- vsububm v2, v8, v2
- /* Mask unwanted bytes after null. */
-#ifdef __LITTLE_ENDIAN__
- vslo v6, v6, v2
- vsro v6, v6, v2
-#else
- vsro v6, v6, v2
- vslo v6, v6, v2
-#endif
- vcmpequb. v11, v0, v6
- blt cr6, L(no_match)
- /* Found a match before null. */
- CALCULATE_MATCH()
- add r3, r8, r6
- blr
-
-L(secondqw):
- addi r8, r3, -16
- FIND_NULL_POS(v3)
- vcmpequb. v11, v0, v2
- blt cr6, L(no_match1)
- vsububm v2, v8, v2
- /* Mask unwanted bytes after null. */
-#ifdef __LITTLE_ENDIAN__
- vslo v7, v7, v2
- vsro v7, v7, v2
-#else
- vsro v7, v7, v2
- vslo v7, v7, v2
-#endif
- vcmpequb. v11, v0, v7
- blt cr6, L(no_match1)
- addi r8, r8, 16
- vor v6, v0, v7
-L(no_match1):
- addi r8, r8, -16
- vcmpequb. v11, v0, v6
- blt cr6, L(no_match)
- /* Found a match before null. */
- CALCULATE_MATCH()
- add r3, r8, r6
- blr
-
-L(match):
- /* One (or both) of the quadwords contains a match. */
- mr r8, r3
- vcmpequb. v8, v0, v7
- blt cr6, L(firstqw)
- /* Match found in second qw. */
- addi r8, r8, 16
- vor v6, v0, v7
-L(firstqw):
- addi r8, r8, -32
- CALCULATE_MATCH()
- add r9, r8, r6 /* Record the address of the match. */
- b L(continue)
-/* We are here because strrchr was called with a null byte. */
- .align 4
-L(null_match):
- /* r0 has a doubleword of null bytes. */
-
- cmpb r5,r12,r0 /* Compare each byte against null bytes. */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r5,r5,r6
- sld r5,r5,r6
-#else
- sld r5,r5,r6
- srd r5,r5,r6
-#endif
- cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
- have been found. */
- bne cr7,L(done_null)
-
- andi. r12, r8, 15
-
- /* Is r8 positioned so that the main loop will read 16B-aligned
- doubleword pairs (r8 % 16 == 8)? If so, skip to the main loop.
- Otherwise, handle the second doubleword of the pair first. */
-
- bne cr0, L(loop_null)
-
- /* Handle WORD2 of pair. */
- ldu r12,8(r8)
- cmpb r5,r12,r0
- cmpdi cr7,r5,0
- bne cr7,L(done_null)
- b L(loop_null) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
-
- /* Main loop to look for the end of the string. Since it's a
- small loop (< 8 instructions), align it to 32 bytes. */
- .p2align 5
-L(loop_null):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r11,16(r8)
- cmpb r5,r12,r0
- cmpb r10,r11,r0
- or r6,r5,r10
- cmpdi cr7,r6,0
- beq cr7,L(vector1)
-
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
-
- cmpdi cr6,r5,0
- addi r8,r8,-8
- bne cr6,L(done_null)
-
- /* The null byte must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- pointer. */
-
- mr r5,r10
- addi r8,r8,8
-
- /* r5 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-L(done_null):
-#ifdef __LITTLE_ENDIAN__
- addi r0,r5,-1
- andc r0,r0,r5
- popcntd r0,r0
-#else
- cntlzd r0,r5 /* Count leading zeros before the match. */
-#endif
- srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
- add r3,r8,r0 /* Return address of the matching null byte. */
- blr
-/* Check the first 32B in GPRs and move to the vectorized loop. */
- .p2align 5
-L(vector1):
- addi r3, r8, 8
- /* Make sure 32B aligned. */
- andi. r10, r3, 31
- bne cr0, L(loop_null)
- vspltisb v0, 0
- /* Precompute vbpermq constant. */
- vspltisb v10, 3
- lvsl v11, r0, r0
- vslb v10, v11, v10
- li r5, 16
- /* Compare 32 bytes in each loop. */
-L(continue1):
- lvx v4, 0, r3
- lvx v5, r3, r5
- vcmpequb v2, v0, v4
- vcmpequb v3, v0, v5
- vor v8, v2, v3
- vcmpequb. v11, v0, v8
- addi r3, r3, 32
- blt cr6, L(continue1)
- addi r3, r3, -32
- VBPERMQ(v2, v2, v10)
- VBPERMQ(v3, v3, v10)
- /* Shift each component into its correct position for merging. */
-#ifdef __LITTLE_ENDIAN__
- vsldoi v3, v3, v3, 2
-#else
- vsldoi v2, v2, v2, 6
- vsldoi v3, v3, v3, 4
-#endif
- /* Merge the results and move to a GPR. */
- vor v4, v3, v2
- MFVRD(r5, v4)
-#ifdef __LITTLE_ENDIAN__
- addi r6, r5, -1
- andc r6, r6, r5
- popcntd r6, r6
-#else
- cntlzd r6, r5 /* Count leading zeros before the match. */
-#endif
- add r3, r3, r6 /* Compute the address of the null byte. */
- blr
-END (strrchr)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
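At the C level, the scan this file implemented (a doubleword pair, then 32B
vector blocks, per iteration) has the following reference shape (a sketch;
ref_strrchr is a hypothetical name):

    #include <stddef.h>

    /* Walk forward remembering the most recent match, as r9 does in the
       assembly above; for c == '\0' this returns the terminator.  */
    static char *
    ref_strrchr (const char *s, int c)
    {
      const char *last = NULL;
      for (;; s++)
        {
          if (*s == (char) c)
            last = s;
          if (*s == '\0')
            return (char *) last;
        }
    }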
diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S
deleted file mode 100644
index e9271898f2..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strspn.S
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Optimized strspn implementation for Power8.
-
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* size_t [r3] strspn (const char *string [r3],
- const char *needleAccept [r4]) */
-
-/* This takes a novel approach by computing a 256-bit mask whereby
-   each set bit implies the byte is "accepted". P8 vector hardware
-   has extremely efficient support for selecting bits from a mask.
-
-   One might ask "why not use bpermd for short strings"? It is
-   so slow that its performance roughly matches the generic PPC64
-   variant without any fancy masking, with the added expense of
-   making the mask. That was the first variant of this code. */
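A C rendering of that approach (a sketch; ref_strspn is a hypothetical name,
and the real code tests 16 bytes per iteration with vbpermq instead of one):

    #include <stddef.h>
    #include <stdint.h>

    /* Build the 256-bit "accepted" bitmap from the needle, then count
       leading bytes of s whose bit is set.  strcspn is the same loop
       with the mask complemented, which is what the INITIAL_MASK and
       UPDATE_MASK macros below select.  */
    static size_t
    ref_strspn (const char *s, const char *accept)
    {
      uint64_t mask[4] = { 0, 0, 0, 0 };
      for (const unsigned char *a = (const unsigned char *) accept; *a; a++)
        mask[*a >> 6] |= (uint64_t) 1 << (*a & 63);

      size_t n = 0;
      for (const unsigned char *p = (const unsigned char *) s;
           mask[*p >> 6] & ((uint64_t) 1 << (*p & 63)); p++)
        n++;
      return n;
    }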
-
-
-
-#include "sysdep.h"
-
-#ifndef USE_AS_STRCSPN
-# define USE_AS_STRCSPN 0
-# ifndef STRSPN
-# define STRSPN strspn
-# endif
-# define INITIAL_MASK 0
-# define UPDATE_MASK(RA, RS, RB) or RA, RS, RB
-#else
-# ifndef STRSPN
-# define STRSPN strcspn
-# endif
-# define INITIAL_MASK -1
-# define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB
-#endif
-
-/* Simple macro to use VSX instructions in overlapping VR's. */
-#define XXVR(insn, vrt, vra, vrb) \
- insn 32+vrt, 32+vra, 32+vrb
-
-/* ISA 2.07B instructions are not all defined for older binutils.
- Macros are defined below for these newer instructions in order
- to maintain compatibility. */
-
-/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */
-#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
-
- /* This can be updated to power8 once the minimum version of
- binutils supports power8 and the above instructions. */
- .machine power7
-EALIGN(STRSPN, 4, 0)
- CALL_MCOUNT 2
-
- /* Generate useful constants for later on. */
- vspltisb v1, 7
- vspltisb v2, -1
- vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */
- vspltisb v10, 0
- vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */
- XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */
-
- /* Prepare to compute 256b mask. */
- addi r4, r4, -1
- li r5, INITIAL_MASK
- li r6, INITIAL_MASK
- li r7, INITIAL_MASK
- li r8, INITIAL_MASK
-
-#if USE_AS_STRCSPN
- /* Ensure the null character never matches by clearing ISA bit 0
- in r5, which is the bit that will check for it in the later use
- of vbpermq. */
- srdi r5, r5, 1
-#endif
-
- li r11, 1
- sldi r11, r11, 63
-
- /* Start interleaved Mask computation.
- This will eventually or 1's into ignored bits from vbpermq. */
- lvsr v11, 0, r3
- vspltb v11, v11, 0 /* Splat shift constant. */
-
- /* Build a 256b mask in r5-r8. */
- .align 4
-L(next_needle):
- lbzu r9, 1(r4)
-
- cmpldi cr0, r9, 0
- cmpldi cr1, r9, 128
-
- /* This is a little tricky. srd only uses the first 7 bits,
- and if bit 7 is set, the result is always 0. So, we can
- effectively shift by 128b in this case. */
- xori r12, r9, 0x40 /* Invert bit 6. */
- srd r10, r11, r9 /* Mask for bits 0-63. */
- srd r12, r11, r12 /* Mask for bits 64-127. */
-
- beq cr0, L(start_cmp)
-
- /* Now, or the value into the correct GPR. */
- bge cr1,L(needle_gt128)
- UPDATE_MASK (r5, r5, r10) /* 0 - 63. */
- UPDATE_MASK (r6, r6, r12) /* 64 - 127. */
- b L(next_needle)
-
- .align 4
-L(needle_gt128):
- UPDATE_MASK (r7, r7, r10) /* 128 - 191. */
- UPDATE_MASK (r8, r8, r12) /* 192 - 255. */
- b L(next_needle)
-
-
- .align 4
-L(start_cmp):
- /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */
- mr r0, r3 /* Save r3 for final length computation. */
- MTVRD (v5, r5)
- MTVRD (v6, r6)
- MTVRD (v7, r7)
- MTVRD (v8, r8)
-
- /* Continue interleaved mask generation. */
-#ifdef __LITTLE_ENDIAN__
- vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */
- vsplth v11, v11, 0 /* Only care about the high 16 bits of v11. */
-#else
- vslw v11, v2, v11 /* Note, shift ignores higher order bits. */
- vsplth v11, v11, 1 /* Only care about the low 16 bits of v11. */
-#endif
- lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */
-
- /* Do the merging of the bitmask. */
- XXVR(xxmrghd, v5, v5, v6)
- XXVR(xxmrghd, v6, v7, v8)
-
- /* Finish mask generation. */
- vand v11, v11, v4 /* Throw away bits not in the mask. */
-
- /* Compare the first 1-16B, while masking unwanted bytes. */
- clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */
- vxor v9, v0, v1 /* Swap high bit. */
- VBPERMQ (v8, v5, v0)
- VBPERMQ (v7, v6, v9)
- vor v7, v7, v8
- vor v7, v7, v11 /* Ignore non-participating bytes. */
- vcmpequh. v8, v7, v4
- bnl cr6, L(done)
-
- addi r3, r3, 16
-
- .align 4
-L(vec):
- lvx v0, 0, r3
- addi r3, r3, 16
- vxor v9, v0, v1 /* Swap high bit. */
- VBPERMQ (v8, v5, v0)
- VBPERMQ (v7, v6, v9)
- vor v7, v7, v8
- vcmpequh. v8, v7, v4
- blt cr6, L(vec)
-
- addi r3, r3, -16
-L(done):
- subf r3, r0, r3
- MFVRD (r10, v7)
-
-#ifdef __LITTLE_ENDIAN__
- addi r0, r10, 1 /* Count the trailing 1's. */
- andc r10, r10, r0
- popcntd r10, r10
-#else
- xori r10, r10, 0xffff /* Count leading 1's by inverting. */
- addi r3, r3, -48 /* Account for the extra leading zeros. */
- cntlzd r10, r10
-#endif
-
- add r3, r3, r10
- blr
-
-END(STRSPN)
-libc_hidden_builtin_def (STRSPN)