x86-64: Compile branred.c with -mprefer-vector-width=128 [BZ #24603]

When compiled with -O3 and AVX, GCC 8 and 9 optimize some loops in sysdeps/ieee754/dbl-64/branred.c with 256-bit vector instructions, which leads to store forward stall: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90579 There is no easy fix in compiler. This patch limits vector width to 128 bits to work around this issue. It improves performance of sin and cos by more than 40% on Skylake compiled with -O3 -march=skylake. Tested with GCC 7/8/9 on x86-64. [BZ #24603] * sysdeps/x86_64/configure.ac: Check if -mprefer-vector-width=128 works. * sysdeps/x86_64/configure: Regenerated. * sysdeps/x86_64/fpu/Makefile (CFLAGS-branred.c): New. Set to -mprefer-vector-width=128 if supported.
author: H.J. Lu <hjl.tools@gmail.com> 2019-07-24 14:48:33 -0700
committer: H.J. Lu <hjl.tools@gmail.com> 2019-07-24 14:48:43 -0700
commit: 7e681561a3aea7aa8f21fb031a7c778147dfdf5b (patch)
tree: 9d70b934aeae381ec82fa7b21481728bbf0ad59a
parent: 82c664ed751f52a3074a9d6d366e87086f10b2f4 (diff)
download: glibc-7e681561a3aea7aa8f21fb031a7c778147dfdf5b.tar
glibc-7e681561a3aea7aa8f21fb031a7c778147dfdf5b.tar.gz
glibc-7e681561a3aea7aa8f21fb031a7c778147dfdf5b.tar.bz2
glibc-7e681561a3aea7aa8f21fb031a7c778147dfdf5b.zip
4 files changed, 52 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index 88108d1e8b..31a6b38bd5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2019-07-24  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #24603]
+	* sysdeps/x86_64/configure.ac: Check if -mprefer-vector-width=128
+	works.
+	* sysdeps/x86_64/configure: Regenerated.
+	* sysdeps/x86_64/fpu/Makefile (CFLAGS-branred.c): New.  Set
+	to -mprefer-vector-width=128 if supported.
+
 2019-07-24  Florian Weimer  <fweimer@redhat.com>
 
 	* scripts/build-many-glibcs.py (Context.checkout): Default to
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
index 8674d14569..84f82c2406 100644
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -54,6 +54,28 @@ fi
 config_vars="$config_vars
 config-cflags-avx512 = $libc_cv_cc_avx512"
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking -mprefer-vector-width=128" >&5
+$as_echo_n "checking -mprefer-vector-width=128... " >&6; }
+if ${libc_cv_cc_mprefer_vector_width+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if { ac_try='${CC-cc} -mprefer-vector-width=128 -xc /dev/null -S -o /dev/null'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  libc_cv_cc_mprefer_vector_width=yes
+else
+  libc_cv_cc_mprefer_vector_width=no
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_mprefer_vector_width" >&5
+$as_echo "$libc_cv_cc_mprefer_vector_width" >&6; }
+config_vars="$config_vars
+config-cflags-mprefer-vector-width = $libc_cv_cc_mprefer_vector_width"
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
 $as_echo_n "checking for Intel MPX support... " >&6; }
 if ${libc_cv_asm_mpx+:} false; then :
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index b7d2c0124f..cdaba0c075 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -25,6 +25,15 @@ if test $libc_cv_cc_avx512 = yes; then
 fi
 LIBC_CONFIG_VAR([config-cflags-avx512], [$libc_cv_cc_avx512])
 
+dnl Check if -mprefer-vector-width=128 works.
+AC_CACHE_CHECK(-mprefer-vector-width=128, libc_cv_cc_mprefer_vector_width, [dnl
+LIBC_TRY_CC_OPTION([-mprefer-vector-width=128],
+		   [libc_cv_cc_mprefer_vector_width=yes],
+		   [libc_cv_cc_mprefer_vector_width=no])
+])
+LIBC_CONFIG_VAR([config-cflags-mprefer-vector-width],
+		[$libc_cv_cc_mprefer_vector_width])
+
 dnl Check whether asm supports Intel MPX
 AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
 cat > conftest.s <<\EOF
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index 2b7d69bb50..74b14ba096 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -237,3 +237,15 @@ CFLAGS-test-float-libmvec-sincosf-avx512.c = -DREQUIRE_AVX512F
 CFLAGS-test-float-libmvec-sincosf-avx512-main.c = $(libmvec-sincos-cflags) $(float-vlen16-arch-ext-cflags)
 endif
 endif
+
+ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
+# When compiled with -O3 -march=skylake, GCC 8 and 9 optimize some loops
+# in branred.c with 256-bit vector instructions, which leads to store
+# forward stall:
+#
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90579
+#
+# Limit vector width to 128 bits to work around this issue.  It improves
+# performance of sin and cos by more than 40% on Skylake.
+CFLAGS-branred.c = -mprefer-vector-width=128
+endif
author	H.J. Lu <hjl.tools@gmail.com>	2019-07-24 14:48:33 -0700
committer	H.J. Lu <hjl.tools@gmail.com>	2019-07-24 14:48:43 -0700
commit	7e681561a3aea7aa8f21fb031a7c778147dfdf5b (patch)
tree	9d70b934aeae381ec82fa7b21481728bbf0ad59a
parent	82c664ed751f52a3074a9d6d366e87086f10b2f4 (diff)
download	glibc-7e681561a3aea7aa8f21fb031a7c778147dfdf5b.tar glibc-7e681561a3aea7aa8f21fb031a7c778147dfdf5b.tar.gz glibc-7e681561a3aea7aa8f21fb031a7c778147dfdf5b.tar.bz2 glibc-7e681561a3aea7aa8f21fb031a7c778147dfdf5b.zip