diff options
author | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2019-03-20 15:34:10 +0000 |
---|---|---|
committer | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2019-04-29 08:43:03 -0300 |
commit | c4c0848bbb7a4ad6ab8149abf982a0f10fd2821b (patch) | |
tree | 756bcbb6fcf2ff9c7905cb813364b2b2cdc53411 /sysdeps | |
parent | f492fc99f48e1a4fe99d9014f92b91fd3fc18fb0 (diff) | |
download | glibc-c4c0848bbb7a4ad6ab8149abf982a0f10fd2821b.tar glibc-c4c0848bbb7a4ad6ab8149abf982a0f10fd2821b.tar.gz glibc-c4c0848bbb7a4ad6ab8149abf982a0f10fd2821b.tar.bz2 glibc-c4c0848bbb7a4ad6ab8149abf982a0f10fd2821b.zip |
powerpc: Remove power4 mpa optimization
This patch removes the POWER4 optimized mpa optimization used currently
on all powerpc targets. In fact for newer chips, GCC generates *worse*
code than generic implementation as below. One possibilty would to
add ifunc variants for the mpa routines (as x86_64), but it will add
complexity only for older chips (and one would need to check if
power5, power5+, and power6 do benefict from this optimization),
and only for specific implementation (since most used one such
as sin, cos, exp, pow where optimized to avoid calling the slow
multiprecision path).
* POWER9 patched
$ ./testrun.sh benchtests/bench-atan
"atan": {
"": {
"duration": 5.12565e+09,
"iterations": 1.552e+08,
"max": 100.552,
"min": 7.799,
"mean": 33.0261
},
"144bits": {
"duration": 5.12745e+09,
"iterations": 825000,
"max": 7517.17,
"min": 6186.3,
"mean": 6215.09
}
}
$ ./testrun.sh benchtests/bench-acos
"acos": {
"": {
"duration": 5.21741e+09,
"iterations": 1.269e+08,
"max": 191.738,
"min": 7.931,
"mean": 41.1144
},
"slow": {
"duration": 5.25999e+09,
"iterations": 198000,
"max": 26681.7,
"min": 26463.6,
"mean": 26565.6
}
}
* POWER9 master
$ ./testrun.sh benchtests/bench-atan
"atan": {
"": {
"duration": 5.12815e+09,
"iterations": 1.552e+08,
"max": 134.788,
"min": 7.803,
"mean": 33.0422
},
"144bits": {
"duration": 5.1209e+09,
"iterations": 447000,
"max": 11615.8,
"min": 11301.8,
"mean": 11456.2
}
}
$ ./testrun.sh benchtests/bench-acos
"acos": {
"": {
"duration": 5.22272e+09,
"iterations": 1.269e+08,
"max": 115.981,
"min": 7.931,
"mean": 41.1562
},
"slow": {
"duration": 5.28723e+09,
"iterations": 96000,
"max": 55434.1,
"min": 54820.6,
"mean": 55075.3
}
}
* POWER8 patched
$ taskset -c 16 ./testrun.sh benchtests/bench-acos
"acos": {
"": {
"duration": 5.16398e+09,
"iterations": 9.99e+07,
"max": 174.408,
"min": 8.645,
"mean": 51.6915
},
"slow": {
"duration": 5.16982e+09,
"iterations": 96000,
"max": 54830.5,
"min": 53703.8,
"mean": 53852.3
}
}
* POWER8 master
$ taskset -c 16 ./testrun.sh benchtests/bench-acos
"acos": {
"": {
"duration": 5.17019e+09,
"iterations": 9.99e+07,
"max": 186.127,
"min": 8.633,
"mean": 51.7537
},
"slow": {
"duration": 5.34225e+09,
"iterations": 90000,
"max": 60353.2,
"min": 59155.3,
"mean": 59358.4
}
}
* POWER7 patched
$ taskset -c 16 benchtests/bench-asin
"asin": {
"": {
"duration": 5.15559e+09,
"iterations": 6.5e+07,
"max": 193.335,
"min": 12.227,
"mean": 79.3168
},
"slow": {
"duration": 5.20538e+09,
"iterations": 80000,
"max": 65705.2,
"min": 64299.4,
"mean": 65067.3
}
}
* POWER7 master
$ taskset -c 16 benchtests/bench-asin
"asin": {
"": {
"duration": 5.15446e+09,
"iterations": 6.5e+07,
"max": 184.575,
"min": 12.226,
"mean": 79.2994
},
"slow": {
"duration": 5.20616e+09,
"iterations": 80000,
"max": 65705.1,
"min": 64336.6,
"mean": 65076.9
}
}
Checked on powerpc-linux-gnu (built without --with-cpu, with
--with-cpu=power4 and with --with-cpu=power5+ and --disable-multi-arch),
powerpc64-linux-gnu (built without --with-cp and with --with-cpu=power5+
and --disable-multi-arch).
* sysdeps/powerpc/power4/fpu/Makefile: Remove file.
* sysdeps/powerpc/power4/fpu/mpa-arch.h: Likewise.
* sysdeps/powerpc/power4/fpu/mpa.c: Likewise.
Reviewed-by: Gabriel F. T. Gomes <gabriel@inconstante.eti.br>
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/powerpc/power4/fpu/Makefile | 5 | ||||
-rw-r--r-- | sysdeps/powerpc/power4/fpu/mpa-arch.h | 56 | ||||
-rw-r--r-- | sysdeps/powerpc/power4/fpu/mpa.c | 214 |
3 files changed, 0 insertions, 275 deletions
diff --git a/sysdeps/powerpc/power4/fpu/Makefile b/sysdeps/powerpc/power4/fpu/Makefile deleted file mode 100644 index f487ed6014..0000000000 --- a/sysdeps/powerpc/power4/fpu/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -# Makefile fragment for POWER4/5/5+ with FPU. - -ifeq ($(subdir),math) -CFLAGS-mpa.c += --param max-unroll-times=4 -funroll-loops -fpeel-loops -endif diff --git a/sysdeps/powerpc/power4/fpu/mpa-arch.h b/sysdeps/powerpc/power4/fpu/mpa-arch.h deleted file mode 100644 index 929c60b314..0000000000 --- a/sysdeps/powerpc/power4/fpu/mpa-arch.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Overridable constants and operations. - Copyright (C) 2013-2019 Free Software Foundation, Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation; either version 2.1 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with this program; if not, see <http://www.gnu.org/licenses/>. */ - -typedef double mantissa_t; -typedef double mantissa_store_t; - -#define TWOPOW(i) (0x1.0p##i) - -#define RADIX TWOPOW (24) /* 2^24 */ -#define CUTTER TWOPOW (76) /* 2^76 */ -#define RADIXI 0x1.0p-24 /* 2^-24 */ -#define TWO52 TWOPOW (52) /* 2^52 */ - -/* Divide D by RADIX and put the remainder in R. */ -#define DIV_RADIX(d,r) \ - ({ \ - double u = ((d) + CUTTER) - CUTTER; \ - if (u > (d)) \ - u -= RADIX; \ - r = (d) - u; \ - (d) = u * RADIXI; \ - }) - -/* Put the integer component of a double X in R and retain the fraction in - X. */ -#define INTEGER_OF(x, r) \ - ({ \ - double u = ((x) + TWO52) - TWO52; \ - if (u > (x)) \ - u -= 1; \ - (r) = u; \ - (x) -= u; \ - }) - -/* Align IN down to a multiple of F, where F is a power of two. */ -#define ALIGN_DOWN_TO(in, f) \ - ({ \ - double factor = f * TWO52; \ - double u = (in + factor) - factor; \ - if (u > in) \ - u -= f; \ - u; \ - }) diff --git a/sysdeps/powerpc/power4/fpu/mpa.c b/sysdeps/powerpc/power4/fpu/mpa.c deleted file mode 100644 index 1be2e93cb7..0000000000 --- a/sysdeps/powerpc/power4/fpu/mpa.c +++ /dev/null @@ -1,214 +0,0 @@ - -/* - * IBM Accurate Mathematical Library - * written by International Business Machines Corp. - * Copyright (C) 2001-2019 Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. - */ - -/* Define __mul and __sqr and use the rest from generic code. */ -#define NO__MUL -#define NO__SQR - -#include <sysdeps/ieee754/dbl-64/mpa.c> - -/* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X - and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P - digits. In case P > 3 the error is bounded by 1.001 ULP. */ -void -__mul (const mp_no *x, const mp_no *y, mp_no *z, int p) -{ - long i, i1, i2, j, k, k2; - long p2 = p; - double u, zk, zk2; - - /* Is z=0? */ - if (__glibc_unlikely (X[0] * Y[0] == 0)) - { - Z[0] = 0; - return; - } - - /* Multiply, add and carry */ - k2 = (p2 < 3) ? p2 + p2 : p2 + 3; - zk = Z[k2] = 0; - for (k = k2; k > 1;) - { - if (k > p2) - { - i1 = k - p2; - i2 = p2 + 1; - } - else - { - i1 = 1; - i2 = k; - } -#if 1 - /* Rearrange this inner loop to allow the fmadd instructions to be - independent and execute in parallel on processors that have - dual symmetrical FP pipelines. */ - if (i1 < (i2 - 1)) - { - /* Make sure we have at least 2 iterations. */ - if (((i2 - i1) & 1L) == 1L) - { - /* Handle the odd iterations case. */ - zk2 = x->d[i2 - 1] * y->d[i1]; - } - else - zk2 = 0.0; - /* Do two multiply/adds per loop iteration, using independent - accumulators; zk and zk2. */ - for (i = i1, j = i2 - 1; i < i2 - 1; i += 2, j -= 2) - { - zk += x->d[i] * y->d[j]; - zk2 += x->d[i + 1] * y->d[j - 1]; - } - zk += zk2; /* Final sum. */ - } - else - { - /* Special case when iterations is 1. */ - zk += x->d[i1] * y->d[i1]; - } -#else - /* The original code. */ - for (i = i1, j = i2 - 1; i < i2; i++, j--) - zk += X[i] * Y[j]; -#endif - - u = (zk + CUTTER) - CUTTER; - if (u > zk) - u -= RADIX; - Z[k] = zk - u; - zk = u * RADIXI; - --k; - } - Z[k] = zk; - - int e = EX + EY; - /* Is there a carry beyond the most significant digit? */ - if (Z[1] == 0) - { - for (i = 1; i <= p2; i++) - Z[i] = Z[i + 1]; - e--; - } - - EZ = e; - Z[0] = X[0] * Y[0]; -} - -/* Square *X and store result in *Y. X and Y may not overlap. For P in - [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the - error is bounded by 1.001 ULP. This is a faster special case of - multiplication. */ -void -__sqr (const mp_no *x, mp_no *y, int p) -{ - long i, j, k, ip; - double u, yk; - - /* Is z=0? */ - if (__glibc_unlikely (X[0] == 0)) - { - Y[0] = 0; - return; - } - - /* We need not iterate through all X's since it's pointless to - multiply zeroes. */ - for (ip = p; ip > 0; ip--) - if (X[ip] != 0) - break; - - k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; - - while (k > 2 * ip + 1) - Y[k--] = 0; - - yk = 0; - - while (k > p) - { - double yk2 = 0.0; - long lim = k / 2; - - if (k % 2 == 0) - { - yk += X[lim] * X[lim]; - lim--; - } - - /* In __mul, this loop (and the one within the next while loop) run - between a range to calculate the mantissa as follows: - - Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1] - + X[n] * Y[k] - - For X == Y, we can get away with summing halfway and doubling the - result. For cases where the range size is even, the mid-point needs - to be added separately (above). */ - for (i = k - p, j = p; i <= lim; i++, j--) - yk2 += X[i] * X[j]; - - yk += 2.0 * yk2; - - u = (yk + CUTTER) - CUTTER; - if (u > yk) - u -= RADIX; - Y[k--] = yk - u; - yk = u * RADIXI; - } - - while (k > 1) - { - double yk2 = 0.0; - long lim = k / 2; - - if (k % 2 == 0) - { - yk += X[lim] * X[lim]; - lim--; - } - - /* Likewise for this loop. */ - for (i = 1, j = k - 1; i <= lim; i++, j--) - yk2 += X[i] * X[j]; - - yk += 2.0 * yk2; - - u = (yk + CUTTER) - CUTTER; - if (u > yk) - u -= RADIX; - Y[k--] = yk - u; - yk = u * RADIXI; - } - Y[k] = yk; - - /* Squares are always positive. */ - Y[0] = 1.0; - - int e = EX * 2; - /* Is there a carry beyond the most significant digit? */ - if (__glibc_unlikely (Y[1] == 0)) - { - for (i = 1; i <= p; i++) - Y[i] = Y[i + 1]; - e--; - } - EY = e; -} |