aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: H.J. Lu <hjl.tools@gmail.com> 2017-03-21 10:59:31 -0700
committer: H.J. Lu <hjl.tools@gmail.com> 2017-04-07 10:06:58 -0700
commit: 3966298a45782a73739ea31d76ee96b5c1a2788f (patch)
tree: db420145876e45b6a7e8ab4c23959095c4bdeecc
parent: cc5dcd88039269bfaeefc0f5b2cf675904f7ee33 (diff)
download: glibc-3966298a45782a73739ea31d76ee96b5c1a2788f.tar
glibc-3966298a45782a73739ea31d76ee96b5c1a2788f.tar.gz
glibc-3966298a45782a73739ea31d76ee96b5c1a2788f.tar.bz2
glibc-3966298a45782a73739ea31d76ee96b5c1a2788f.zip
x86-64: Improve branch prediction in _dl_runtime_resolve_avx512_opt [BZ #21258]
On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve the
first 8 vector registers.  The code layout is

  if only %xmm0 - %xmm7 registers are used
    preserve %xmm0 - %xmm7 registers
  if only %ymm0 - %ymm7 registers are used
    preserve %ymm0 - %ymm7 registers
  preserve %zmm0 - %zmm7 registers

Branch prediction always executes the fallthrough code path to preserve
%zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7
registers are used.  This leads to lower CPU frequency on Skylake
server.  This patch changes the fallthrough code path to preserve
%xmm0 - %xmm7 registers instead:

  if whole %zmm0 - %zmm7 registers are used
    preserve %zmm0 - %zmm7 registers
  if only %ymm0 - %ymm7 registers are used
    preserve %ymm0 - %ymm7 registers
  preserve %xmm0 - %xmm7 registers

Tested on Skylake server.

    [BZ #21258]
    * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
    Define only if _dl_runtime_resolve is defined to
    _dl_runtime_resolve_sse_vex.
    * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
    Fallthrough to _dl_runtime_resolve_sse_vex.

(cherry picked from commit c15f8eb50cea7ad1a4ccece6e0982bf426d52c00)
-rw-r--r-- ChangeLog | 9
-rw-r--r-- sysdeps/x86_64/dl-trampoline.S | 3
-rw-r--r-- sysdeps/x86_64/dl-trampoline.h | 9
3 files changed, 15 insertions, 6 deletions
diff --git a/ChangeLog b/ChangeLog
index 9047f652fe..6e4696c7df 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2017-04-07 H.J. Lu <hongjiu.lu@intel.com>
+
+ [BZ #21258]
+ * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
+ Define only if _dl_runtime_resolve is defined to
+ _dl_runtime_resolve_sse_vex.
+ * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
+ Fallthrough to _dl_runtime_resolve_sse_vex.
+
2017-04-03 Mike Frysinger <vapier@gentoo.org>
[BZ #21253]
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 39f595e1e1..50b23633e3 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -87,11 +87,9 @@
#endif
#define VEC(i) zmm##i
#define _dl_runtime_resolve _dl_runtime_resolve_avx512
-#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
#define _dl_runtime_profile _dl_runtime_profile_avx512
#include "dl-trampoline.h"
#undef _dl_runtime_resolve
-#undef _dl_runtime_resolve_opt
#undef _dl_runtime_profile
#undef VEC
#undef VMOV
@@ -145,4 +143,5 @@
# define VMOV vmovdqu
#endif
#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex
+#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
#include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index abe4471c1d..32ad3af202 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -129,19 +129,20 @@ _dl_runtime_resolve_opt:
# YMM state isn't in use.
PRESERVE_BND_REGS_PREFIX
jz _dl_runtime_resolve_sse_vex
-# elif VEC_SIZE == 64
+# elif VEC_SIZE == 16
# For ZMM registers, check if YMM state and ZMM state are in
# use.
andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
cmpl $bit_YMM_state, %r11d
- # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
- # neither YMM state nor ZMM state are in use.
+ # Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
PRESERVE_BND_REGS_PREFIX
- jl _dl_runtime_resolve_sse_vex
+ jg _dl_runtime_resolve_avx512
# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
# ZMM state isn't in use.
PRESERVE_BND_REGS_PREFIX
je _dl_runtime_resolve_avx
+ # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
+ # neither YMM state nor ZMM state are in use.
# else
# error Unsupported VEC_SIZE!
# endif