diff options
Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/power4')
-rw-r--r-- | REORG.TODO/sysdeps/powerpc/power4/fpu/Makefile | 7 | ||||
-rw-r--r-- | REORG.TODO/sysdeps/powerpc/power4/fpu/mpa-arch.h | 56 | ||||
-rw-r--r-- | REORG.TODO/sysdeps/powerpc/power4/fpu/mpa.c | 214 | ||||
-rw-r--r-- | REORG.TODO/sysdeps/powerpc/power4/wordcopy.c | 212 |
4 files changed, 489 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/powerpc/power4/fpu/Makefile b/REORG.TODO/sysdeps/powerpc/power4/fpu/Makefile new file mode 100644 index 0000000000..e17d32f30e --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/power4/fpu/Makefile @@ -0,0 +1,7 @@ +# Makefile fragment for POWER4/5/5+ with FPU. + +ifeq ($(subdir),math) +CFLAGS-mpa.c += --param max-unroll-times=4 -funroll-loops -fpeel-loops +CPPFLAGS-slowpow.c += -DUSE_LONG_DOUBLE_FOR_MP=1 +CPPFLAGS-slowexp.c += -DUSE_LONG_DOUBLE_FOR_MP=1 +endif diff --git a/REORG.TODO/sysdeps/powerpc/power4/fpu/mpa-arch.h b/REORG.TODO/sysdeps/powerpc/power4/fpu/mpa-arch.h new file mode 100644 index 0000000000..4754c1b0f9 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/power4/fpu/mpa-arch.h @@ -0,0 +1,56 @@ +/* Overridable constants and operations. + Copyright (C) 2013-2017 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. */ + +typedef double mantissa_t; +typedef double mantissa_store_t; + +#define TWOPOW(i) (0x1.0p##i) + +#define RADIX TWOPOW (24) /* 2^24 */ +#define CUTTER TWOPOW (76) /* 2^76 */ +#define RADIXI 0x1.0p-24 /* 2^-24 */ +#define TWO52 TWOPOW (52) /* 2^52 */ + +/* Divide D by RADIX and put the remainder in R. */ +#define DIV_RADIX(d,r) \ + ({ \ + double u = ((d) + CUTTER) - CUTTER; \ + if (u > (d)) \ + u -= RADIX; \ + r = (d) - u; \ + (d) = u * RADIXI; \ + }) + +/* Put the integer component of a double X in R and retain the fraction in + X. */ +#define INTEGER_OF(x, r) \ + ({ \ + double u = ((x) + TWO52) - TWO52; \ + if (u > (x)) \ + u -= 1; \ + (r) = u; \ + (x) -= u; \ + }) + +/* Align IN down to a multiple of F, where F is a power of two. */ +#define ALIGN_DOWN_TO(in, f) \ + ({ \ + double factor = f * TWO52; \ + double u = (in + factor) - factor; \ + if (u > in) \ + u -= f; \ + u; \ + }) diff --git a/REORG.TODO/sysdeps/powerpc/power4/fpu/mpa.c b/REORG.TODO/sysdeps/powerpc/power4/fpu/mpa.c new file mode 100644 index 0000000000..0a0f7175b4 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/power4/fpu/mpa.c @@ -0,0 +1,214 @@ + +/* + * IBM Accurate Mathematical Library + * written by International Business Machines Corp. + * Copyright (C) 2001-2017 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* Define __mul and __sqr and use the rest from generic code. */ +#define NO__MUL +#define NO__SQR + +#include <sysdeps/ieee754/dbl-64/mpa.c> + +/* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X + and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P + digits. In case P > 3 the error is bounded by 1.001 ULP. */ +void +__mul (const mp_no *x, const mp_no *y, mp_no *z, int p) +{ + long i, i1, i2, j, k, k2; + long p2 = p; + double u, zk, zk2; + + /* Is z=0? */ + if (__glibc_unlikely (X[0] * Y[0] == 0)) + { + Z[0] = 0; + return; + } + + /* Multiply, add and carry */ + k2 = (p2 < 3) ? p2 + p2 : p2 + 3; + zk = Z[k2] = 0; + for (k = k2; k > 1;) + { + if (k > p2) + { + i1 = k - p2; + i2 = p2 + 1; + } + else + { + i1 = 1; + i2 = k; + } +#if 1 + /* Rearrange this inner loop to allow the fmadd instructions to be + independent and execute in parallel on processors that have + dual symmetrical FP pipelines. */ + if (i1 < (i2 - 1)) + { + /* Make sure we have at least 2 iterations. */ + if (((i2 - i1) & 1L) == 1L) + { + /* Handle the odd iterations case. */ + zk2 = x->d[i2 - 1] * y->d[i1]; + } + else + zk2 = 0.0; + /* Do two multiply/adds per loop iteration, using independent + accumulators; zk and zk2. */ + for (i = i1, j = i2 - 1; i < i2 - 1; i += 2, j -= 2) + { + zk += x->d[i] * y->d[j]; + zk2 += x->d[i + 1] * y->d[j - 1]; + } + zk += zk2; /* Final sum. */ + } + else + { + /* Special case when iterations is 1. */ + zk += x->d[i1] * y->d[i1]; + } +#else + /* The original code. */ + for (i = i1, j = i2 - 1; i < i2; i++, j--) + zk += X[i] * Y[j]; +#endif + + u = (zk + CUTTER) - CUTTER; + if (u > zk) + u -= RADIX; + Z[k] = zk - u; + zk = u * RADIXI; + --k; + } + Z[k] = zk; + + int e = EX + EY; + /* Is there a carry beyond the most significant digit? */ + if (Z[1] == 0) + { + for (i = 1; i <= p2; i++) + Z[i] = Z[i + 1]; + e--; + } + + EZ = e; + Z[0] = X[0] * Y[0]; +} + +/* Square *X and store result in *Y. X and Y may not overlap. For P in + [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the + error is bounded by 1.001 ULP. This is a faster special case of + multiplication. */ +void +__sqr (const mp_no *x, mp_no *y, int p) +{ + long i, j, k, ip; + double u, yk; + + /* Is z=0? */ + if (__glibc_unlikely (X[0] == 0)) + { + Y[0] = 0; + return; + } + + /* We need not iterate through all X's since it's pointless to + multiply zeroes. */ + for (ip = p; ip > 0; ip--) + if (X[ip] != 0) + break; + + k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; + + while (k > 2 * ip + 1) + Y[k--] = 0; + + yk = 0; + + while (k > p) + { + double yk2 = 0.0; + long lim = k / 2; + + if (k % 2 == 0) + { + yk += X[lim] * X[lim]; + lim--; + } + + /* In __mul, this loop (and the one within the next while loop) run + between a range to calculate the mantissa as follows: + + Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1] + + X[n] * Y[k] + + For X == Y, we can get away with summing halfway and doubling the + result. For cases where the range size is even, the mid-point needs + to be added separately (above). */ + for (i = k - p, j = p; i <= lim; i++, j--) + yk2 += X[i] * X[j]; + + yk += 2.0 * yk2; + + u = (yk + CUTTER) - CUTTER; + if (u > yk) + u -= RADIX; + Y[k--] = yk - u; + yk = u * RADIXI; + } + + while (k > 1) + { + double yk2 = 0.0; + long lim = k / 2; + + if (k % 2 == 0) + { + yk += X[lim] * X[lim]; + lim--; + } + + /* Likewise for this loop. */ + for (i = 1, j = k - 1; i <= lim; i++, j--) + yk2 += X[i] * X[j]; + + yk += 2.0 * yk2; + + u = (yk + CUTTER) - CUTTER; + if (u > yk) + u -= RADIX; + Y[k--] = yk - u; + yk = u * RADIXI; + } + Y[k] = yk; + + /* Squares are always positive. */ + Y[0] = 1.0; + + int e = EX * 2; + /* Is there a carry beyond the most significant digit? */ + if (__glibc_unlikely (Y[1] == 0)) + { + for (i = 1; i <= p; i++) + Y[i] = Y[i + 1]; + e--; + } + EY = e; +} diff --git a/REORG.TODO/sysdeps/powerpc/power4/wordcopy.c b/REORG.TODO/sysdeps/powerpc/power4/wordcopy.c new file mode 100644 index 0000000000..2648e25c58 --- /dev/null +++ b/REORG.TODO/sysdeps/powerpc/power4/wordcopy.c @@ -0,0 +1,212 @@ +/* _memcopy.c -- subroutines for memory copy functions. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund (tege@sics.se). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...! */ + +#include <stddef.h> +#include <memcopy.h> + +/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). + Both SRCP and DSTP should be aligned for memory operations on `op_t's. */ + +#ifndef WORDCOPY_FWD_ALIGNED +# define WORDCOPY_FWD_ALIGNED _wordcopy_fwd_aligned +#endif + +void +WORDCOPY_FWD_ALIGNED (long int dstp, long int srcp, size_t len) +{ + op_t a0, a1; + + if (len & 1) + { + ((op_t *) dstp)[0] = ((op_t *) srcp)[0]; + + if (len == 1) + return; + srcp += OPSIZ; + dstp += OPSIZ; + len -= 1; + } + + do + { + a0 = ((op_t *) srcp)[0]; + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[0] = a0; + ((op_t *) dstp)[1] = a1; + + srcp += 2 * OPSIZ; + dstp += 2 * OPSIZ; + len -= 2; + } + while (len != 0); +} + +/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). + DSTP should be aligned for memory operations on `op_t's, but SRCP must + *not* be aligned. */ + +#ifndef WORDCOPY_FWD_DEST_ALIGNED +# define WORDCOPY_FWD_DEST_ALIGNED _wordcopy_fwd_dest_aligned +#endif + +void +WORDCOPY_FWD_DEST_ALIGNED (long int dstp, long int srcp, size_t len) +{ + op_t a0, a1, a2; + int sh_1, sh_2; + + /* Calculate how to shift a word read at the memory operation + aligned srcp to make it aligned for copy. */ + + sh_1 = 8 * (srcp % OPSIZ); + sh_2 = 8 * OPSIZ - sh_1; + + /* Make SRCP aligned by rounding it down to the beginning of the `op_t' + it points in the middle of. */ + srcp &= -OPSIZ; + a0 = ((op_t *) srcp)[0]; + + if (len & 1) + { + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[0] = MERGE (a0, sh_1, a1, sh_2); + + if (len == 1) + return; + + a0 = a1; + srcp += OPSIZ; + dstp += OPSIZ; + len -= 1; + } + + do + { + a1 = ((op_t *) srcp)[1]; + a2 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[0] = MERGE (a0, sh_1, a1, sh_2); + ((op_t *) dstp)[1] = MERGE (a1, sh_1, a2, sh_2); + a0 = a2; + + srcp += 2 * OPSIZ; + dstp += 2 * OPSIZ; + len -= 2; + } + while (len != 0); +} + +/* _wordcopy_bwd_aligned -- Copy block finishing right before + SRCP to block finishing right before DSTP with LEN `op_t' words + (not LEN bytes!). Both SRCP and DSTP should be aligned for memory + operations on `op_t's. */ + +#ifndef WORDCOPY_BWD_ALIGNED +# define WORDCOPY_BWD_ALIGNED _wordcopy_bwd_aligned +#endif + +void +WORDCOPY_BWD_ALIGNED (long int dstp, long int srcp, size_t len) +{ + op_t a0, a1; + + if (len & 1) + { + srcp -= OPSIZ; + dstp -= OPSIZ; + ((op_t *) dstp)[0] = ((op_t *) srcp)[0]; + + if (len == 1) + return; + len -= 1; + } + + do + { + srcp -= 2 * OPSIZ; + dstp -= 2 * OPSIZ; + + a1 = ((op_t *) srcp)[1]; + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[1] = a1; + ((op_t *) dstp)[0] = a0; + + len -= 2; + } + while (len != 0); +} + +/* _wordcopy_bwd_dest_aligned -- Copy block finishing right + before SRCP to block finishing right before DSTP with LEN `op_t' + words (not LEN bytes!). DSTP should be aligned for memory + operations on `op_t', but SRCP must *not* be aligned. */ + +#ifndef WORDCOPY_BWD_DEST_ALIGNED +# define WORDCOPY_BWD_DEST_ALIGNED _wordcopy_bwd_dest_aligned +#endif + +void +WORDCOPY_BWD_DEST_ALIGNED (long int dstp, long int srcp, size_t len) +{ + op_t a0, a1, a2; + int sh_1, sh_2; + + /* Calculate how to shift a word read at the memory operation + aligned srcp to make it aligned for copy. */ + + sh_1 = 8 * (srcp % OPSIZ); + sh_2 = 8 * OPSIZ - sh_1; + + /* Make srcp aligned by rounding it down to the beginning of the op_t + it points in the middle of. */ + srcp &= -OPSIZ; + a2 = ((op_t *) srcp)[0]; + + if (len & 1) + { + srcp -= OPSIZ; + dstp -= OPSIZ; + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = MERGE (a1, sh_1, a2, sh_2); + + if (len == 1) + return; + + a2 = a1; + len -= 1; + } + + do + { + srcp -= 2 * OPSIZ; + dstp -= 2 * OPSIZ; + + a1 = ((op_t *) srcp)[1]; + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[1] = MERGE (a1, sh_1, a2, sh_2); + ((op_t *) dstp)[0] = MERGE (a0, sh_1, a1, sh_2); + a2 = a0; + + len -= 2; + } + while (len != 0); +} |