38 files changed, 910 insertions, 417 deletions
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 282805e396..e86d8b5b63 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -172,8 +172,8 @@ _dl_start_user:							\n\
 	cmp	x0, #0						\n\
 	bne	1b						\n\
 	// Update _dl_argv					\n\
-	adrp	x3, _dl_argv					\n\
-	str	x2, [x3, #:lo12:_dl_argv]			\n\
+	adrp	x3, __GI__dl_argv				\n\
+	str	x2, [x3, #:lo12:__GI__dl_argv]			\n\
 .L_done_stack_adjust:						\n\
 	// compute envp						\n\
 	add	x3, x2, x1, lsl #3				\n\
diff --git a/sysdeps/gnu/glob64.c b/sysdeps/gnu/glob64.c
index d1e4e6f0d5..52e97e2f6a 100644
--- a/sysdeps/gnu/glob64.c
+++ b/sysdeps/gnu/glob64.c
@@ -15,11 +15,8 @@
 #undef __stat
 #define __stat(file, buf) __xstat64 (_STAT_VER, file, buf)
 
-#define NO_GLOB_PATTERN_P 1
-
 #define COMPILE_GLOB64	1
 
 #include <posix/glob.c>
 
 libc_hidden_def (glob64)
-libc_hidden_def (globfree64)
diff --git a/sysdeps/gnu/globfree64.c b/sysdeps/gnu/globfree64.c
new file mode 100644
index 0000000000..f092d0bf8b
--- /dev/null
+++ b/sysdeps/gnu/globfree64.c
@@ -0,0 +1,10 @@
+#include <dirent.h>
+#include <glob.h>
+#include <sys/stat.h>
+
+#define glob_t glob64_t
+#define globfree(pglob) globfree64 (pglob)
+
+#include <posix/globfree.c>
+
+libc_hidden_def (globfree64)
diff --git a/sysdeps/ieee754/dbl-64/e_pow.c b/sysdeps/ieee754/dbl-64/e_pow.c
index 663fa392c2..bd758b5979 100644
--- a/sysdeps/ieee754/dbl-64/e_pow.c
+++ b/sysdeps/ieee754/dbl-64/e_pow.c
@@ -466,15 +466,15 @@ checkint (double x)
     return (n & 1) ? -1 : 1;	/* odd or even */
   if (k > 20)
     {
-      if (n << (k - 20))
+      if (n << (k - 20) != 0)
 	return 0;		/* if not integer */
-      return (n << (k - 21)) ? -1 : 1;
+      return (n << (k - 21) != 0) ? -1 : 1;
     }
   if (n)
     return 0;			/*if  not integer */
   if (k == 20)
     return (m & 1) ? -1 : 1;
-  if (m << (k + 12))
+  if (m << (k + 12) != 0)
     return 0;
-  return (m << (k + 11)) ? -1 : 1;
+  return (m << (k + 11) != 0) ? -1 : 1;
 }
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 35e1ed48d2..32beaa67d0 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -140,7 +140,7 @@ endif
 ifeq ($(subdir),posix)
 sysdep_headers += bits/initspin.h
 
-sysdep_routines += sched_getcpu
+sysdep_routines += sched_getcpu oldglob
 
 tests += tst-affinity tst-affinity-pid
 
diff --git a/sysdeps/unix/sysv/linux/alpha/glob.c b/sysdeps/unix/sysv/linux/alpha/glob.c
index c5dfb85468..19eb9b1c07 100644
--- a/sysdeps/unix/sysv/linux/alpha/glob.c
+++ b/sysdeps/unix/sysv/linux/alpha/glob.c
@@ -42,10 +42,6 @@ extern void __new_globfree (glob_t *__pglob);
 #undef globfree64
 
 versioned_symbol (libc, __new_glob, glob, GLIBC_2_1);
-versioned_symbol (libc, __new_globfree, globfree, GLIBC_2_1);
 libc_hidden_ver (__new_glob, glob)
-libc_hidden_ver (__new_globfree, globfree)
 
 weak_alias (__new_glob, glob64)
-weak_alias (__new_globfree, globfree64)
-libc_hidden_ver (__new_globfree, globfree64)
diff --git a/sysdeps/unix/sysv/linux/alpha/globfree.c b/sysdeps/unix/sysv/linux/alpha/globfree.c
new file mode 100644
index 0000000000..98cf1c200b
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/alpha/globfree.c
@@ -0,0 +1,37 @@
+/* Compat globfree.  Linux/alpha version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define globfree64 __no_globfree64_decl
+#include <sys/types.h>
+#include <glob.h>
+#include <shlib-compat.h>
+
+#define globfree(pglob) \
+  __new_globfree (pglob)
+
+extern void __new_globfree (glob_t *__pglob);
+
+#include <posix/globfree.c>
+
+#undef globfree64
+
+versioned_symbol (libc, __new_globfree, globfree, GLIBC_2_1);
+libc_hidden_ver (__new_globfree, globfree)
+
+weak_alias (__new_globfree, globfree64)
+libc_hidden_ver (__new_globfree, globfree64)
diff --git a/sysdeps/unix/sysv/linux/i386/glob64.c b/sysdeps/unix/sysv/linux/i386/glob64.c
index 802c957d6c..c2cc85741f 100644
--- a/sysdeps/unix/sysv/linux/i386/glob64.c
+++ b/sysdeps/unix/sysv/linux/i386/glob64.c
@@ -19,6 +19,7 @@
 #include <dirent.h>
 #include <glob.h>
 #include <sys/stat.h>
+#include <shlib-compat.h>
 
 #define dirent dirent64
 #define __readdir(dirp) __readdir64 (dirp)
@@ -33,44 +34,9 @@
 #undef __stat
 #define __stat(file, buf) __xstat64 (_STAT_VER, file, buf)
 
-#define NO_GLOB_PATTERN_P 1
-
 #define COMPILE_GLOB64	1
 
 #include <posix/glob.c>
 
-#include "shlib-compat.h"
-
-libc_hidden_def (globfree64)
-
 versioned_symbol (libc, __glob64, glob64, GLIBC_2_2);
 libc_hidden_ver (__glob64, glob64)
-
-#if SHLIB_COMPAT(libc, GLIBC_2_1, GLIBC_2_2)
-
-#include <sysdeps/unix/sysv/linux/i386/olddirent.h>
-
-int __old_glob64 (const char *__pattern, int __flags,
-		  int (*__errfunc) (const char *, int),
-		  glob64_t *__pglob);
-
-#undef dirent
-#define dirent __old_dirent64
-#undef GL_READDIR
-# define GL_READDIR(pglob, stream) \
-  ((struct __old_dirent64 *) (pglob)->gl_readdir (stream))
-#undef __readdir
-#define __readdir(dirp) __old_readdir64 (dirp)
-#undef glob
-#define glob(pattern, flags, errfunc, pglob) \
-  __old_glob64 (pattern, flags, errfunc, pglob)
-#define convert_dirent __old_convert_dirent
-#define glob_in_dir __old_glob_in_dir
-#define GLOB_ATTRIBUTE attribute_compat_text_section
-
-#define GLOB_ONLY_P 1
-
-#include <posix/glob.c>
-
-compat_symbol (libc, __old_glob64, glob64, GLIBC_2_1);
-#endif
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/globfree64.c b/sysdeps/unix/sysv/linux/mips/mips64/n64/globfree64.c
new file mode 100644
index 0000000000..abc35fdd2b
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/globfree64.c
@@ -0,0 +1 @@
+/* glob64 is in globfree64.c */
diff --git a/sysdeps/unix/sysv/linux/oldglob.c b/sysdeps/unix/sysv/linux/oldglob.c
new file mode 100644
index 0000000000..8233e57ce9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/oldglob.c
@@ -0,0 +1,42 @@
+#include <shlib-compat.h>
+
+#if SHLIB_COMPAT(libc, GLIBC_2_1, GLIBC_2_2)
+
+#include <dirent.h>
+#include <glob.h>
+#include <sys/stat.h>
+
+#include <sysdeps/unix/sysv/linux/i386/olddirent.h>
+
+int __old_glob64 (const char *__pattern, int __flags,
+		  int (*__errfunc) (const char *, int),
+		  glob64_t *__pglob);
+libc_hidden_proto (__old_glob64);
+
+#define dirent __old_dirent64
+#define GL_READDIR(pglob, stream) \
+  ((struct __old_dirent64 *) (pglob)->gl_readdir (stream))
+#undef __readdir
+#define __readdir(dirp) __old_readdir64 (dirp)
+
+#define glob_t glob64_t
+#define glob(pattern, flags, errfunc, pglob) \
+  __old_glob64 (pattern, flags, errfunc, pglob)
+#define globfree(pglob) globfree64(pglob)
+
+#define convert_dirent __old_convert_dirent
+#define glob_in_dir __old_glob_in_dir
+
+#undef stat
+#define stat stat64
+#undef __stat
+#define __stat(file, buf) __xstat64 (_STAT_VER, file, buf)
+
+#define GLOB_ATTRIBUTE attribute_compat_text_section
+
+#include <posix/glob.c>
+
+libc_hidden_def (__old_glob64);
+
+compat_symbol (libc, __old_glob64, glob64, GLIBC_2_1);
+#endif
diff --git a/sysdeps/unix/sysv/linux/sh/sh3/ucontext_i.sym b/sysdeps/unix/sysv/linux/sh/sh3/ucontext_i.sym
index 17397c5511..25f914a93b 100644
--- a/sysdeps/unix/sysv/linux/sh/sh3/ucontext_i.sym
+++ b/sysdeps/unix/sysv/linux/sh/sh3/ucontext_i.sym
@@ -13,22 +13,22 @@ SIG_SETMASK
 oLINK		ucontext (uc_link)
 oSS_SP		ucontext (uc_stack.ss_sp)
 oSS_SIZE	ucontext (uc_stack.ss_size)
-oR0		mcontext (gregs[R0])
-oR1		mcontext (gregs[R1])
-oR2		mcontext (gregs[R2])
-oR3		mcontext (gregs[R3])
-oR4		mcontext (gregs[R4])
-oR5		mcontext (gregs[R5])
-oR6		mcontext (gregs[R6])
-oR7		mcontext (gregs[R7])
-oR8		mcontext (gregs[R8])
-oR9		mcontext (gregs[R9])
-oR10		mcontext (gregs[R10])
-oR11		mcontext (gregs[R11])
-oR12		mcontext (gregs[R12])
-oR13		mcontext (gregs[R13])
-oR14		mcontext (gregs[R14])
-oR15		mcontext (gregs[R15])
+oR0		mcontext (gregs[REG_R0])
+oR1		mcontext (gregs[REG_R1])
+oR2		mcontext (gregs[REG_R2])
+oR3		mcontext (gregs[REG_R3])
+oR4		mcontext (gregs[REG_R4])
+oR5		mcontext (gregs[REG_R5])
+oR6		mcontext (gregs[REG_R6])
+oR7		mcontext (gregs[REG_R7])
+oR8		mcontext (gregs[REG_R8])
+oR9		mcontext (gregs[REG_R9])
+oR10		mcontext (gregs[REG_R10])
+oR11		mcontext (gregs[REG_R11])
+oR12		mcontext (gregs[REG_R12])
+oR13		mcontext (gregs[REG_R13])
+oR14		mcontext (gregs[REG_R14])
+oR15		mcontext (gregs[REG_R15])
 oPC		mcontext (pc)
 oPR		mcontext (pr)
 oSR		mcontext (sr)
diff --git a/sysdeps/unix/sysv/linux/sh/sh4/ucontext_i.sym b/sysdeps/unix/sysv/linux/sh/sh4/ucontext_i.sym
index 65633fbcf4..130f60cd96 100644
--- a/sysdeps/unix/sysv/linux/sh/sh4/ucontext_i.sym
+++ b/sysdeps/unix/sysv/linux/sh/sh4/ucontext_i.sym
@@ -13,22 +13,22 @@ SIG_SETMASK
 oLINK		ucontext (uc_link)
 oSS_SP		ucontext (uc_stack.ss_sp)
 oSS_SIZE	ucontext (uc_stack.ss_size)
-oR0		mcontext (gregs[R0])
-oR1		mcontext (gregs[R1])
-oR2		mcontext (gregs[R2])
-oR3		mcontext (gregs[R3])
-oR4		mcontext (gregs[R4])
-oR5		mcontext (gregs[R5])
-oR6		mcontext (gregs[R6])
-oR7		mcontext (gregs[R7])
-oR8		mcontext (gregs[R8])
-oR9		mcontext (gregs[R9])
-oR10		mcontext (gregs[R10])
-oR11		mcontext (gregs[R11])
-oR12		mcontext (gregs[R12])
-oR13		mcontext (gregs[R13])
-oR14		mcontext (gregs[R14])
-oR15		mcontext (gregs[R15])
+oR0		mcontext (gregs[REG_R0])
+oR1		mcontext (gregs[REG_R1])
+oR2		mcontext (gregs[REG_R2])
+oR3		mcontext (gregs[REG_R3])
+oR4		mcontext (gregs[REG_R4])
+oR5		mcontext (gregs[REG_R5])
+oR6		mcontext (gregs[REG_R6])
+oR7		mcontext (gregs[REG_R7])
+oR8		mcontext (gregs[REG_R8])
+oR9		mcontext (gregs[REG_R9])
+oR10		mcontext (gregs[REG_R10])
+oR11		mcontext (gregs[REG_R11])
+oR12		mcontext (gregs[REG_R12])
+oR13		mcontext (gregs[REG_R13])
+oR14		mcontext (gregs[REG_R14])
+oR15		mcontext (gregs[REG_R15])
 oPC		mcontext (pc)
 oPR		mcontext (pr)
 oSR		mcontext (sr)
diff --git a/sysdeps/unix/sysv/linux/sh/sys/ucontext.h b/sysdeps/unix/sysv/linux/sh/sys/ucontext.h
index ab9a7e66bf..037fbb73e8 100644
--- a/sysdeps/unix/sysv/linux/sh/sys/ucontext.h
+++ b/sysdeps/unix/sysv/linux/sh/sys/ucontext.h
@@ -31,49 +31,47 @@
 typedef int greg_t;
 
 /* Number of general registers.  */
-#define NGPREG	16
+#define NGREG	16
 
 /* Container for all general registers.  */
-typedef greg_t gregset_t[NGPREG];
+typedef greg_t gregset_t[NGREG];
 
-#ifdef __USE_GNU
 /* Number of each register is the `gregset_t' array.  */
 enum
 {
-  R0 = 0,
-#define R0	R0
-  R1 = 1,
-#define R1	R1
-  R2 = 2,
-#define R2	R2
-  R3 = 3,
-#define R3	R3
-  R4 = 4,
-#define R4	R4
-  R5 = 5,
-#define R5	R5
-  R6 = 6,
-#define R6	R6
-  R7 = 7,
-#define R7	R7
-  R8 = 8,
-#define R8	R8
-  R9 = 9,
-#define R9	R9
-  R10 = 10,
-#define R10	R10
-  R11 = 11,
-#define R11	R11
-  R12 = 12,
-#define R12	R12
-  R13 = 13,
-#define R13	R13
-  R14 = 14,
-#define R14	R14
-  R15 = 15,
-#define R15	R15
+  REG_R0 = 0,
+#define REG_R0	REG_R0
+  REG_R1 = 1,
+#define REG_R1	REG_R1
+  REG_R2 = 2,
+#define REG_R2	REG_R2
+  REG_R3 = 3,
+#define REG_R3	REG_R3
+  REG_R4 = 4,
+#define REG_R4	REG_R4
+  REG_R5 = 5,
+#define REG_R5	REG_R5
+  REG_R6 = 6,
+#define REG_R6	REG_R6
+  REG_R7 = 7,
+#define REG_R7	REG_R7
+  REG_R8 = 8,
+#define REG_R8	REG_R8
+  REG_R9 = 9,
+#define REG_R9	REG_R9
+  REG_R10 = 10,
+#define REG_R10	REG_R10
+  REG_R11 = 11,
+#define REG_R11	REG_R11
+  REG_R12 = 12,
+#define REG_R12	REG_R12
+  REG_R13 = 13,
+#define REG_R13	REG_R13
+  REG_R14 = 14,
+#define REG_R14	REG_R14
+  REG_R15 = 15,
+#define REG_R15	REG_R15
 };
-#endif
 
 #if (defined(__SH4__) || defined(__SH4A__))
 typedef int freg_t;
diff --git a/sysdeps/unix/sysv/linux/wordsize-64/globfree64.c b/sysdeps/unix/sysv/linux/wordsize-64/globfree64.c
new file mode 100644
index 0000000000..af035e1514
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/wordsize-64/globfree64.c
@@ -0,0 +1,2 @@
+/* This file is here so sysdeps/gnu/glob64.c doesn't take precedence.  */
+#include <sysdeps/wordsize-64/globfree64.c>
diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/globfree.c b/sysdeps/unix/sysv/linux/x86_64/x32/globfree.c
new file mode 100644
index 0000000000..b76a761c17
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/globfree.c
@@ -0,0 +1 @@
+#include <sysdeps/wordsize-64/globfree.c>
diff --git a/sysdeps/wordsize-64/glob.c b/sysdeps/wordsize-64/glob.c
index 082faf1c70..954e8d37e2 100644
--- a/sysdeps/wordsize-64/glob.c
+++ b/sysdeps/wordsize-64/glob.c
@@ -4,5 +4,3 @@
 #undef glob64
 #undef globfree64
 weak_alias (glob, glob64)
-weak_alias (globfree, globfree64)
-libc_hidden_ver (globfree, globfree64)
diff --git a/sysdeps/wordsize-64/globfree.c b/sysdeps/wordsize-64/globfree.c
new file mode 100644
index 0000000000..ec8c35b489
--- /dev/null
+++ b/sysdeps/wordsize-64/globfree.c
@@ -0,0 +1,5 @@
+#define globfree64 __no_globfree64_decl
+#include <posix/globfree.c>
+#undef globfree64
+weak_alias (globfree, globfree64)
+libc_hidden_ver (globfree, globfree64)
diff --git a/sysdeps/wordsize-64/globfree64.c b/sysdeps/wordsize-64/globfree64.c
new file mode 100644
index 0000000000..a0f57ff4b3
--- /dev/null
+++ b/sysdeps/wordsize-64/globfree64.c
@@ -0,0 +1 @@
+/* globfree64 is in globfree.c */
diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym
index f6739fae81..33dd094e37 100644
--- a/sysdeps/x86/cpu-features-offsets.sym
+++ b/sysdeps/x86/cpu-features-offsets.sym
@@ -15,6 +15,7 @@ CPUID_ECX_OFFSET	offsetof (struct cpuid_registers, ecx)
 CPUID_EDX_OFFSET	offsetof (struct cpuid_registers, edx)
 FAMILY_OFFSET		offsetof (struct cpu_features, family)
 MODEL_OFFSET		offsetof (struct cpu_features, model)
+XSAVE_STATE_SIZE_OFFSET	offsetof (struct cpu_features, xsave_state_size)
 FEATURE_OFFSET		offsetof (struct cpu_features, feature)
 FEATURE_SIZE		sizeof (unsigned int)
 
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index d1ee922290..9eca98817d 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -18,6 +18,7 @@
 
 #include <cpuid.h>
 #include <cpu-features.h>
+#include <libc-internal.h>
 
 static void
 get_common_indeces (struct cpu_features *cpu_features,
@@ -88,6 +89,71 @@ get_common_indeces (struct cpu_features *cpu_features,
 	    cpu_features->feature[index_arch_FMA_Usable]
 	      |= bit_arch_FMA_Usable;
 	}
+
+      /* For _dl_runtime_resolve, set xsave_state_size to xsave area
+	 size + integer register save size and align it to 64 bytes.  */
+      if (cpu_features->max_cpuid >= 0xd)
+	{
+	  unsigned int eax, ebx, ecx, edx;
+
+	  __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
+	  if (ebx != 0)
+	    {
+	      cpu_features->xsave_state_size
+		= ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
+
+	      __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
+
+	      /* Check if XSAVEC is available.  */
+	      if ((eax & (1 << 1)) != 0)
+		{
+		  unsigned int xstate_comp_offsets[32];
+		  unsigned int xstate_comp_sizes[32];
+		  unsigned int i;
+
+		  xstate_comp_offsets[0] = 0;
+		  xstate_comp_offsets[1] = 160;
+		  xstate_comp_offsets[2] = 576;
+		  xstate_comp_sizes[0] = 160;
+		  xstate_comp_sizes[1] = 256;
+
+		  for (i = 2; i < 32; i++)
+		    {
+		      if ((STATE_SAVE_MASK & (1 << i)) != 0)
+			{
+			  __cpuid_count (0xd, i, eax, ebx, ecx, edx);
+			  xstate_comp_sizes[i] = eax;
+			}
+		      else
+			{
+			  ecx = 0;
+			  xstate_comp_sizes[i] = 0;
+			}
+
+		      if (i > 2)
+			{
+			  xstate_comp_offsets[i]
+			    = (xstate_comp_offsets[i - 1]
+			       + xstate_comp_sizes[i -1]);
+			  if ((ecx & (1 << 1)) != 0)
+			    xstate_comp_offsets[i]
+			      = ALIGN_UP (xstate_comp_offsets[i], 64);
+			}
+		    }
+
+		  /* Use XSAVEC.  */
+		  unsigned int size
+		    = xstate_comp_offsets[31] + xstate_comp_sizes[31];
+		  if (size)
+		    {
+		      cpu_features->xsave_state_size
+			= ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
+		      cpu_features->feature[index_arch_XSAVEC_Usable]
+			|= bit_arch_XSAVEC_Usable;
+		    }
+		}
+	    }
+	}
     }
 }
 
@@ -213,20 +279,6 @@ init_cpu_features (struct cpu_features *cpu_features)
       else
 	cpu_features->feature[index_arch_Prefer_No_AVX512]
 	  |= bit_arch_Prefer_No_AVX512;
-
-      /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
-         If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.  */
-      cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow]
-	|= bit_arch_Use_dl_runtime_resolve_slow;
-      if (cpu_features->max_cpuid >= 0xd)
-	{
-	  unsigned int eax;
-
-	  __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
-	  if ((eax & (1 << 2)) != 0)
-	    cpu_features->feature[index_arch_Use_dl_runtime_resolve_opt]
-	      |= bit_arch_Use_dl_runtime_resolve_opt;
-	}
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 2609ac0999..507a141414 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -37,9 +37,8 @@
 #define bit_arch_Prefer_No_VZEROUPPER		(1 << 17)
 #define bit_arch_Fast_Unaligned_Copy		(1 << 18)
 #define bit_arch_Prefer_ERMS			(1 << 19)
-#define bit_arch_Use_dl_runtime_resolve_opt	(1 << 20)
-#define bit_arch_Use_dl_runtime_resolve_slow	(1 << 21)
-#define bit_arch_Prefer_No_AVX512		(1 << 22)
+#define bit_arch_Prefer_No_AVX512		(1 << 20)
+#define bit_arch_XSAVEC_Usable			(1 << 21)
 
 /* CPUID Feature flags.  */
 
@@ -82,6 +81,15 @@
 /* The current maximum size of the feature integer bit array.  */
 #define FEATURE_INDEX_MAX 1
 
+/* Offset for fxsave/xsave area used by _dl_runtime_resolve.  Also need
+   space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX.  It must be
+   aligned to 16 bytes for fxsave and 64 bytes for xsave.  */
+#define STATE_SAVE_OFFSET (8 * 7 + 8)
+
+/* Save SSE, AVX, AVX512, mask and bound registers.  */
+#define STATE_SAVE_MASK \
+  ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
+
 #ifdef	__ASSEMBLER__
 
 # include <cpu-features-offsets.h>
@@ -206,6 +214,12 @@ struct cpu_features
   } cpuid[COMMON_CPUID_INDEX_MAX];
   unsigned int family;
   unsigned int model;
+  /* The type must be unsigned long int so that we use
+
+	sub xsave_state_size_offset(%rip) %RSP_LP
+
+     in _dl_runtime_resolve.  */
+  unsigned long int xsave_state_size;
   unsigned int feature[FEATURE_INDEX_MAX];
 };
 
@@ -298,9 +312,8 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
 # define index_arch_Fast_Unaligned_Copy	FEATURE_INDEX_1
 # define index_arch_Prefer_ERMS		FEATURE_INDEX_1
-# define index_arch_Use_dl_runtime_resolve_opt FEATURE_INDEX_1
-# define index_arch_Use_dl_runtime_resolve_slow FEATURE_INDEX_1
 # define index_arch_Prefer_No_AVX512	FEATURE_INDEX_1
+# define index_arch_XSAVEC_Usable	FEATURE_INDEX_1
 
 #endif	/* !__ASSEMBLER__ */
 
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 6d99284cd0..cc990a9685 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -27,7 +27,7 @@ ifeq ($(subdir),elf)
 CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
 		   -mno-mmx)
 
-sysdep-dl-routines += tlsdesc dl-tlsdesc
+sysdep-dl-routines += tlsdesc dl-tlsdesc tls_get_addr
 
 tests += ifuncmain8
 modules-names += ifuncmod8
@@ -49,9 +49,12 @@ extra-test-objs += tst-quadmod1pie.o tst-quadmod2pie.o
 $(objpfx)tst-quad1pie: $(objpfx)tst-quadmod1pie.o
 $(objpfx)tst-quad2pie: $(objpfx)tst-quadmod2pie.o
 
-tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 tst-audit10
-test-extras += tst-audit4-aux tst-audit10-aux
-extra-test-objs += tst-audit4-aux.o tst-audit10-aux.o
+tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 \
+	 tst-audit10 tst-sse tst-avx tst-avx512
+test-extras += tst-audit4-aux tst-audit10-aux \
+	       tst-avx-aux tst-avx512-aux
+extra-test-objs += tst-audit4-aux.o tst-audit10-aux.o \
+		   tst-avx-aux.o tst-avx512-aux.o
 
 tests += tst-split-dynreloc
 LDFLAGS-tst-split-dynreloc = -Wl,-T,$(..)sysdeps/x86_64/tst-split-dynreloc.lds
@@ -62,7 +65,8 @@ modules-names += tst-auditmod3a tst-auditmod3b \
 		tst-auditmod5a tst-auditmod5b \
 		tst-auditmod6a tst-auditmod6b tst-auditmod6c \
 		tst-auditmod7a tst-auditmod7b \
-		tst-auditmod10a tst-auditmod10b
+		tst-auditmod10a tst-auditmod10b \
+		tst-ssemod tst-avxmod tst-avx512mod
 
 $(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
 $(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
@@ -89,6 +93,10 @@ $(objpfx)tst-audit10: $(objpfx)tst-audit10-aux.o $(objpfx)tst-auditmod10a.so
 $(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so
 tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so
 
+$(objpfx)tst-sse: $(objpfx)tst-ssemod.so
+$(objpfx)tst-avx: $(objpfx)tst-avx-aux.o $(objpfx)tst-avxmod.so
+$(objpfx)tst-avx512: $(objpfx)tst-avx512-aux.o $(objpfx)tst-avx512mod.so
+
 AVX-CFLAGS=-mavx -mno-vzeroupper
 CFLAGS-tst-audit4-aux.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod4a.c += $(AVX-CFLAGS)
@@ -96,14 +104,18 @@ CFLAGS-tst-auditmod4b.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod6b.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod6c.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod7b.c += $(AVX-CFLAGS)
+CFLAGS-tst-avx-aux.c += $(AVX-CFLAGS)
+CFLAGS-tst-avxmod.c += $(AVX-CFLAGS)
 ifeq (yes,$(config-cflags-avx512))
 AVX512-CFLAGS = -mavx512f
 CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS)
 CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS)
 CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS)
+CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS)
+CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS)
 endif
 endif
 
 ifeq ($(subdir),csu)
-gen-as-const-headers += tlsdesc.sym
+gen-as-const-headers += tlsdesc.sym rtld-offsets.sym
 endif
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index c0f0fa16a2..8355432dfc 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -66,12 +66,9 @@ static inline int __attribute__ ((unused, always_inline))
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
@@ -120,29 +117,14 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
-	    {
-	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
-		*(ElfW(Addr) *) (got + 2)
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
-	      else
-		*(ElfW(Addr) *) (got + 2)
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
-	    }
-	  else if (HAS_ARCH_FEATURE (AVX_Usable))
-	    {
-	      if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
-		*(ElfW(Addr) *) (got + 2)
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
-	      else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
-		*(ElfW(Addr) *) (got + 2)
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
-	      else
-		*(ElfW(Addr) *) (got + 2)
-		  = (ElfW(Addr)) &_dl_runtime_resolve_avx;
-	    }
+	  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+	    *(ElfW(Addr) *) (got + 2)
+	      = (HAS_ARCH_FEATURE (XSAVEC_Usable)
+		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
+		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
 	  else
-	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	    *(ElfW(Addr) *) (got + 2)
+	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
 	}
     }
 
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
new file mode 100644
index 0000000000..3584805c8e
--- /dev/null
+++ b/sysdeps/x86_64/dl-tls.c
@@ -0,0 +1,53 @@
+/* Thread-local storage handling in the ELF dynamic linker.  x86-64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef SHARED
+/* Work around GCC PR58066, due to which __tls_get_addr may be called
+   with an unaligned stack.  The compat implementation is in
+   tls_get_addr-compat.S.  */
+
+# include <dl-tls.h>
+
+/* Define __tls_get_addr within elf/dl-tls.c under a different
+   name.  */
+extern __typeof__ (__tls_get_addr) ___tls_get_addr;
+
+# define __tls_get_addr ___tls_get_addr
+# include <elf/dl-tls.c>
+# undef __tls_get_addr
+
+hidden_ver (___tls_get_addr, __tls_get_addr)
+
+/* Only handle slow paths for __tls_get_addr.  */
+attribute_hidden
+void *
+__tls_get_addr_slow (GET_ADDR_ARGS)
+{
+  dtv_t *dtv = THREAD_DTV ();
+
+  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
+    return update_get_addr (GET_ADDR_PARAM);
+
+  return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
+}
+#else
+
+/* No compatibility symbol needed.  */
+# include <elf/dl-tls.c>
+
+#endif
diff --git a/sysdeps/x86_64/dl-tls.h b/sysdeps/x86_64/dl-tls.h
index cf6c107f54..fa5bf6cd93 100644
--- a/sysdeps/x86_64/dl-tls.h
+++ b/sysdeps/x86_64/dl-tls.h
@@ -16,6 +16,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifndef _X86_64_DL_TLS_H
+#define _X86_64_DL_TLS_H
+
 #include <stdint.h>
 
 /* Type used for the representation of TLS information in the GOT.  */
@@ -27,3 +30,5 @@ typedef struct dl_tls_index
 
 
 extern void *__tls_get_addr (tls_index *ti);
+
+#endif /* _X86_64_DL_TLS_H */
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 50b23633e3..b4cda0f535 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -34,41 +34,24 @@
 # define DL_STACK_ALIGNMENT 8
 #endif
 
-#ifndef DL_RUNTIME_UNALIGNED_VEC_SIZE
-/* The maximum size in bytes of unaligned vector load and store in the
-   dynamic linker.  Since SSE optimized memory/string functions with
-   aligned SSE register load and store are used in the dynamic linker,
-   we must set this to 8 so that _dl_runtime_resolve_sse will align the
-   stack before calling _dl_fixup.  */
-# define DL_RUNTIME_UNALIGNED_VEC_SIZE 8
-#endif
-
-/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes.  */
+/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
+   stack to 16 bytes before calling _dl_fixup.  */
 #define DL_RUNTIME_RESOLVE_REALIGN_STACK \
-  (VEC_SIZE > DL_STACK_ALIGNMENT \
-   && VEC_SIZE > DL_RUNTIME_UNALIGNED_VEC_SIZE)
-
-/* Align vector register save area to 16 bytes.  */
-#define REGISTER_SAVE_VEC_OFF	0
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
+   || 16 > DL_STACK_ALIGNMENT)
 
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* Align bound register save area to 16 bytes.  */
-# define REGISTER_SAVE_BND0	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
-# define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
-# define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
-# define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
 # ifdef HAVE_MPX_SUPPORT
 #  define PRESERVE_BND_REGS_PREFIX bnd
 # else
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
 # endif
 #endif
+#define REGISTER_SAVE_RAX	0
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
@@ -80,68 +63,56 @@
 
 #define VEC_SIZE		64
 #define VMOVA			vmovdqa64
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
-# define VMOV			vmovdqa64
-#else
-# define VMOV			vmovdqu64
-#endif
 #define VEC(i)			zmm##i
-#define _dl_runtime_resolve	_dl_runtime_resolve_avx512
 #define _dl_runtime_profile	_dl_runtime_profile_avx512
 #include "dl-trampoline.h"
-#undef _dl_runtime_resolve
 #undef _dl_runtime_profile
 #undef VEC
-#undef VMOV
 #undef VMOVA
 #undef VEC_SIZE
 
 #define VEC_SIZE		32
 #define VMOVA			vmovdqa
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
-# define VMOV			vmovdqa
-#else
-# define VMOV			vmovdqu
-#endif
 #define VEC(i)			ymm##i
-#define _dl_runtime_resolve	_dl_runtime_resolve_avx
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx_opt
 #define _dl_runtime_profile	_dl_runtime_profile_avx
 #include "dl-trampoline.h"
-#undef _dl_runtime_resolve
-#undef _dl_runtime_resolve_opt
 #undef _dl_runtime_profile
 #undef VEC
-#undef VMOV
 #undef VMOVA
 #undef VEC_SIZE
 
 /* movaps/movups is 1-byte shorter.  */
 #define VEC_SIZE		16
 #define VMOVA			movaps
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
-# define VMOV			movaps
-#else
-# define VMOV			movups
-#endif
 #define VEC(i)			xmm##i
-#define _dl_runtime_resolve	_dl_runtime_resolve_sse
 #define _dl_runtime_profile	_dl_runtime_profile_sse
 #undef RESTORE_AVX
 #include "dl-trampoline.h"
-#undef _dl_runtime_resolve
 #undef _dl_runtime_profile
-#undef VMOV
+#undef VEC
 #undef VMOVA
+#undef VEC_SIZE
 
-/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt
-   to preserve the full vector registers with zero upper bits.  */
-#define VMOVA			vmovdqa
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
-# define VMOV			vmovdqa
-#else
-# define VMOV			vmovdqu
-#endif
-#define _dl_runtime_resolve	_dl_runtime_resolve_sse_vex
-#define _dl_runtime_resolve_opt	_dl_runtime_resolve_avx512_opt
+#define USE_FXSAVE
+#define STATE_SAVE_ALIGNMENT	16
+#define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef USE_FXSAVE
+#undef STATE_SAVE_ALIGNMENT
+
+#define USE_XSAVE
+#define STATE_SAVE_ALIGNMENT	64
+#define _dl_runtime_resolve	_dl_runtime_resolve_xsave
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef USE_XSAVE
+#undef STATE_SAVE_ALIGNMENT
+
+#define USE_XSAVEC
+#define STATE_SAVE_ALIGNMENT	64
+#define _dl_runtime_resolve	_dl_runtime_resolve_xsavec
 #include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef USE_XSAVEC
+#undef STATE_SAVE_ALIGNMENT
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index 32ad3af202..b9c2f1796f 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -16,140 +16,47 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#undef REGISTER_SAVE_AREA_RAW
-#ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
-   VEC7.  */
-# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
-#else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
-   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
-# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
-#endif
+	.text
+#ifdef _dl_runtime_resolve
 
-#undef REGISTER_SAVE_AREA
-#undef LOCAL_STORAGE_AREA
-#undef BASE
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK
-# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
-/* Local stack area before jumping to function address: RBX.  */
-# define LOCAL_STORAGE_AREA	8
-# define BASE			rbx
-# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
-#  error REGISTER_SAVE_AREA must be multples of VEC_SIZE
-# endif
-#else
-# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
-/* Local stack area before jumping to function address:  All saved
-   registers.  */
-# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
-# define BASE			rsp
-# if (REGISTER_SAVE_AREA % 16) != 8
-#  error REGISTER_SAVE_AREA must be odd multples of 8
-# endif
-#endif
+# undef REGISTER_SAVE_AREA
+# undef LOCAL_STORAGE_AREA
+# undef BASE
 
-	.text
-#ifdef _dl_runtime_resolve_opt
-/* Use the smallest vector registers to preserve the full YMM/ZMM
-   registers to avoid SSE transition penalty.  */
+# if (STATE_SAVE_ALIGNMENT % 16) != 0
+#  error STATE_SAVE_ALIGNMENT must be multples of 16
+# endif
 
-# if VEC_SIZE == 32
-/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero
-   and preserve %xmm0 - %xmm7 registers with the zero upper bits.  Since
-   there is no SSE transition penalty on AVX512 processors which don't
-   support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't
-   provided.   */
-	.globl _dl_runtime_resolve_avx_slow
-	.hidden _dl_runtime_resolve_avx_slow
-	.type _dl_runtime_resolve_avx_slow, @function
-	.align 16
-_dl_runtime_resolve_avx_slow:
-	cfi_startproc
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	vorpd %ymm0, %ymm1, %ymm8
-	vorpd %ymm2, %ymm3, %ymm9
-	vorpd %ymm4, %ymm5, %ymm10
-	vorpd %ymm6, %ymm7, %ymm11
-	vorpd %ymm8, %ymm9, %ymm9
-	vorpd %ymm10, %ymm11, %ymm10
-	vpcmpeqd %xmm8, %xmm8, %xmm8
-	vorpd %ymm9, %ymm10, %ymm10
-	vptest %ymm10, %ymm8
-	# Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any
-	# %ymm0 - %ymm7 registers aren't zero.
-	PRESERVE_BND_REGS_PREFIX
-	jnc _dl_runtime_resolve_avx
-	# Use vzeroupper to avoid SSE transition penalty.
-	vzeroupper
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits
-	# when the upper 128 bits of %ymm0 - %ymm7 registers are zero.
-	PRESERVE_BND_REGS_PREFIX
-	jmp _dl_runtime_resolve_sse_vex
-	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
-	cfi_endproc
-	.size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow
+# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
+#  error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT
 # endif
 
-/* Use XGETBV with ECX == 1 to check which bits in vector registers are
-   non-zero and only preserve the non-zero lower bits with zero upper
-   bits.  */
-	.globl _dl_runtime_resolve_opt
-	.hidden _dl_runtime_resolve_opt
-	.type _dl_runtime_resolve_opt, @function
-	.align 16
-_dl_runtime_resolve_opt:
-	cfi_startproc
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	pushq %rax
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%rax, 0)
-	pushq %rcx
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%rcx, 0)
-	pushq %rdx
-	cfi_adjust_cfa_offset(8)
-	cfi_rel_offset(%rdx, 0)
-	movl $1, %ecx
-	xgetbv
-	movl %eax, %r11d
-	popq %rdx
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore (%rdx)
-	popq %rcx
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore (%rcx)
-	popq %rax
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore (%rax)
-# if VEC_SIZE == 32
-	# For YMM registers, check if YMM state is in use.
-	andl $bit_YMM_state, %r11d
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if
-	# YMM state isn't in use.
-	PRESERVE_BND_REGS_PREFIX
-	jz _dl_runtime_resolve_sse_vex
-# elif VEC_SIZE == 16
-	# For ZMM registers, check if YMM state and ZMM state are in
-	# use.
-	andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
-	cmpl $bit_YMM_state, %r11d
-	# Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
-	PRESERVE_BND_REGS_PREFIX
-	jg _dl_runtime_resolve_avx512
-	# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
-	# ZMM state isn't in use.
-	PRESERVE_BND_REGS_PREFIX
-	je _dl_runtime_resolve_avx
-	# Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
-	# neither YMM state nor ZMM state are in use.
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
+/* Local stack area before jumping to function address: RBX.  */
+#  define LOCAL_STORAGE_AREA	8
+#  define BASE			rbx
+#  ifdef USE_FXSAVE
+/* Use fxsave to save XMM registers.  */
+#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
+#   if (REGISTER_SAVE_AREA % 16) != 0
+#    error REGISTER_SAVE_AREA must be multples of 16
+#   endif
+#  endif
 # else
-#  error Unsupported VEC_SIZE!
+#  ifndef USE_FXSAVE
+#   error USE_FXSAVE must be defined
+#  endif
+/* Use fxsave to save XMM registers.  */
+#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+#  define BASE			rsp
+#  if (REGISTER_SAVE_AREA % 16) != 8
+#   error REGISTER_SAVE_AREA must be odd multples of 8
+#  endif
 # endif
-	cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
-	cfi_endproc
-	.size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt
-#endif
+
 	.globl _dl_runtime_resolve
 	.hidden _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -157,19 +64,30 @@ _dl_runtime_resolve_opt:
 	cfi_startproc
 _dl_runtime_resolve:
 	cfi_adjust_cfa_offset(16) # Incorporate PLT
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK
-# if LOCAL_STORAGE_AREA != 8
-#  error LOCAL_STORAGE_AREA must be 8
-# endif
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
+#  if LOCAL_STORAGE_AREA != 8
+#   error LOCAL_STORAGE_AREA must be 8
+#  endif
 	pushq %rbx			# push subtracts stack by 8.
 	cfi_adjust_cfa_offset(8)
 	cfi_rel_offset(%rbx, 0)
 	mov %RSP_LP, %RBX_LP
 	cfi_def_cfa_register(%rbx)
-	and $-VEC_SIZE, %RSP_LP
-#endif
+	and $-STATE_SAVE_ALIGNMENT, %RSP_LP
+# endif
+# ifdef REGISTER_SAVE_AREA
 	sub $REGISTER_SAVE_AREA, %RSP_LP
+#  if !DL_RUNTIME_RESOLVE_REALIGN_STACK
 	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+#  endif
+# else
+	# Allocate stack space of the required size to save the state.
+#  if IS_IN (rtld)
+	sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
+#  else
+	sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
+#  endif
+# endif
 	# Preserve registers otherwise clobbered.
 	movq %rax, REGISTER_SAVE_RAX(%rsp)
 	movq %rcx, REGISTER_SAVE_RCX(%rsp)
@@ -178,59 +96,42 @@ _dl_runtime_resolve:
 	movq %rdi, REGISTER_SAVE_RDI(%rsp)
 	movq %r8, REGISTER_SAVE_R8(%rsp)
 	movq %r9, REGISTER_SAVE_R9(%rsp)
-	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
-	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
-	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
-	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
-	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
-	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
-	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
-	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
-#ifndef __ILP32__
-	# We also have to preserve bound registers.  These are nops if
-	# Intel MPX isn't available or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# ifdef USE_FXSAVE
+	fxsave STATE_SAVE_OFFSET(%rsp)
 # else
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1b,0x04,0x24
+	movl $STATE_SAVE_MASK, %eax
+	xorl %edx, %edx
+	# Clear the XSAVE Header.
+#  ifdef USE_XSAVE
+	movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
+#  endif
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
+	movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
+#  ifdef USE_XSAVE
+	xsave STATE_SAVE_OFFSET(%rsp)
 #  else
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+	xsavec STATE_SAVE_OFFSET(%rsp)
 #  endif
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
 # endif
-#endif
 	# Copy args pushed by PLT in register.
 	# %rdi: link_map, %rsi: reloc_index
 	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
 	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
 	call _dl_fixup		# Call resolver.
 	mov %RAX_LP, %R11_LP	# Save return value
-#ifndef __ILP32__
-	# Restore bound registers.  These are nops if Intel MPX isn't
-	# avaiable or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+	# Get register content back.
+# ifdef USE_FXSAVE
+	fxrstor STATE_SAVE_OFFSET(%rsp)
 # else
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1a,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
+	movl $STATE_SAVE_MASK, %eax
+	xorl %edx, %edx
+	xrstor STATE_SAVE_OFFSET(%rsp)
 # endif
-#endif
-	# Get register content back.
 	movq REGISTER_SAVE_R9(%rsp), %r9
 	movq REGISTER_SAVE_R8(%rsp), %r8
 	movq REGISTER_SAVE_RDI(%rsp), %rdi
@@ -238,20 +139,12 @@ _dl_runtime_resolve:
 	movq REGISTER_SAVE_RDX(%rsp), %rdx
 	movq REGISTER_SAVE_RCX(%rsp), %rcx
 	movq REGISTER_SAVE_RAX(%rsp), %rax
-	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
-	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
-#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
 	mov %RBX_LP, %RSP_LP
 	cfi_def_cfa_register(%rsp)
 	movq (%rsp), %rbx
 	cfi_restore(%rbx)
-#endif
+# endif
 	# Adjust stack(PLT did 2 pushes)
 	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
 	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
@@ -260,11 +153,9 @@ _dl_runtime_resolve:
 	jmp *%r11		# Jump to function address.
 	cfi_endproc
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+#endif
 
 
-/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
-   twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
-   But we don't need another _dl_runtime_profile for XMM registers.  */
 #if !defined PROF && defined _dl_runtime_profile
 # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
 #  error LR_VECTOR_OFFSET must be multples of VEC_SIZE
diff --git a/sysdeps/x86_64/rtld-offsets.sym b/sysdeps/x86_64/rtld-offsets.sym
new file mode 100644
index 0000000000..fd41b51521
--- /dev/null
+++ b/sysdeps/x86_64/rtld-offsets.sym
@@ -0,0 +1,6 @@
+#define SHARED
+#include <ldsodefs.h>
+
+--
+
+GL_TLS_GENERATION_OFFSET        offsetof (struct rtld_global, _dl_tls_generation)
diff --git a/sysdeps/x86_64/tls_get_addr.S b/sysdeps/x86_64/tls_get_addr.S
new file mode 100644
index 0000000000..9d38fb3be5
--- /dev/null
+++ b/sysdeps/x86_64/tls_get_addr.S
@@ -0,0 +1,61 @@
+/* Stack-aligning implementation of __tls_get_addr.  x86-64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef SHARED
+
+# include <sysdep.h>
+# include "tlsdesc.h"
+# include "rtld-offsets.h"
+
+/* See __tls_get_addr and __tls_get_addr_slow in dl-tls.c.  This function
+   call __tls_get_addr_slow on both slow paths.  It realigns the stack
+   before the call to work around GCC PR58066.  */
+
+ENTRY (__tls_get_addr)
+	mov 	%fs:DTV_OFFSET, %RDX_LP
+	mov	GL_TLS_GENERATION_OFFSET+_rtld_local(%rip), %RAX_LP
+	/* GL(dl_tls_generation) == dtv[0].counter */
+	cmp	%RAX_LP, (%rdx)
+	jne	1f
+	mov	TI_MODULE_OFFSET(%rdi), %RAX_LP
+	/* dtv[ti->ti_module] */
+# ifdef __LP64__
+	salq	$4, %rax
+	movq	(%rdx,%rax), %rax
+# else
+	movl	(%rdx,%rax, 8), %eax
+# endif
+	cmp	$-1, %RAX_LP
+	je	1f
+	add	TI_OFFSET_OFFSET(%rdi), %RAX_LP
+	ret
+1:
+	/* On the slow path, align the stack.  */
+	pushq	%rbp
+	cfi_def_cfa_offset (16)
+	cfi_offset (%rbp, -16)
+	mov	%RSP_LP, %RBP_LP
+	cfi_def_cfa_register (%rbp)
+	and	$-16, %RSP_LP
+	call	__tls_get_addr_slow
+	mov	%RBP_LP, %RSP_LP
+	popq	%rbp
+	cfi_def_cfa (%rsp, 8)
+	ret
+END (__tls_get_addr)
+#endif /* SHARED */
diff --git a/sysdeps/x86_64/tlsdesc.sym b/sysdeps/x86_64/tlsdesc.sym
index 33854975d0..fc897ab4b5 100644
--- a/sysdeps/x86_64/tlsdesc.sym
+++ b/sysdeps/x86_64/tlsdesc.sym
@@ -15,3 +15,6 @@ TLSDESC_ARG			offsetof(struct tlsdesc, arg)
 TLSDESC_GEN_COUNT		offsetof(struct tlsdesc_dynamic_arg, gen_count)
 TLSDESC_MODID			offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
 TLSDESC_MODOFF			offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
+
+TI_MODULE_OFFSET 		offsetof(tls_index, ti_module)
+TI_OFFSET_OFFSET 		offsetof(tls_index, ti_offset)
diff --git a/sysdeps/x86_64/tst-avx-aux.c b/sysdeps/x86_64/tst-avx-aux.c
new file mode 100644
index 0000000000..e3807de7bb
--- /dev/null
+++ b/sysdeps/x86_64/tst-avx-aux.c
@@ -0,0 +1,47 @@
+/* Test case for preserved AVX registers in dynamic linker, -mavx part.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+int
+tst_avx_aux (void)
+{
+#ifdef __AVX__
+  extern __m256i avx_test (__m256i, __m256i, __m256i, __m256i,
+			   __m256i, __m256i, __m256i, __m256i);
+
+  __m256i ymm0 = _mm256_set1_epi32 (0);
+  __m256i ymm1 = _mm256_set1_epi32 (1);
+  __m256i ymm2 = _mm256_set1_epi32 (2);
+  __m256i ymm3 = _mm256_set1_epi32 (3);
+  __m256i ymm4 = _mm256_set1_epi32 (4);
+  __m256i ymm5 = _mm256_set1_epi32 (5);
+  __m256i ymm6 = _mm256_set1_epi32 (6);
+  __m256i ymm7 = _mm256_set1_epi32 (7);
+  __m256i ret = avx_test (ymm0, ymm1, ymm2, ymm3,
+			  ymm4, ymm5, ymm6, ymm7);
+  ymm0 =  _mm256_set1_epi32 (0x12349876);
+  if (memcmp (&ymm0, &ret, sizeof (ret)))
+    abort ();
+  return 0;
+#else  /* __AVX__ */
+  return 77;
+#endif  /* __AVX__ */
+}
diff --git a/sysdeps/x86_64/tst-avx.c b/sysdeps/x86_64/tst-avx.c
new file mode 100644
index 0000000000..ec2e3a79ff
--- /dev/null
+++ b/sysdeps/x86_64/tst-avx.c
@@ -0,0 +1,49 @@
+/* Test case for preserved AVX registers in dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <cpuid.h>
+
+int tst_avx_aux (void);
+
+static int
+avx_enabled (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0
+      || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE))
+    return 0;
+
+  /* Check the OS has AVX and SSE saving enabled.  */
+  asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
+
+  return (eax & 6) == 6;
+}
+
+static int
+do_test (void)
+{
+  /* Run AVX test only if AVX is supported.  */
+  if (avx_enabled ())
+    return tst_avx_aux ();
+  else
+    return 77;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/sysdeps/x86_64/tst-avx512-aux.c b/sysdeps/x86_64/tst-avx512-aux.c
new file mode 100644
index 0000000000..6cebc523f2
--- /dev/null
+++ b/sysdeps/x86_64/tst-avx512-aux.c
@@ -0,0 +1,48 @@
+/* Test case for preserved AVX512 registers in dynamic linker,
+   -mavx512 part.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+int
+tst_avx512_aux (void)
+{
+#ifdef __AVX512F__
+  extern __m512i avx512_test (__m512i, __m512i, __m512i, __m512i,
+			      __m512i, __m512i, __m512i, __m512i);
+
+  __m512i zmm0 = _mm512_set1_epi32 (0);
+  __m512i zmm1 = _mm512_set1_epi32 (1);
+  __m512i zmm2 = _mm512_set1_epi32 (2);
+  __m512i zmm3 = _mm512_set1_epi32 (3);
+  __m512i zmm4 = _mm512_set1_epi32 (4);
+  __m512i zmm5 = _mm512_set1_epi32 (5);
+  __m512i zmm6 = _mm512_set1_epi32 (6);
+  __m512i zmm7 = _mm512_set1_epi32 (7);
+  __m512i ret = avx512_test (zmm0, zmm1, zmm2, zmm3,
+			     zmm4, zmm5, zmm6, zmm7);
+  zmm0 =  _mm512_set1_epi32 (0x12349876);
+  if (memcmp (&zmm0, &ret, sizeof (ret)))
+    abort ();
+  return 0;
+#else  /* __AVX512F__ */
+  return 77;
+#endif  /* __AVX512F__ */
+}
diff --git a/sysdeps/x86_64/tst-avx512.c b/sysdeps/x86_64/tst-avx512.c
new file mode 100644
index 0000000000..a8e42ef553
--- /dev/null
+++ b/sysdeps/x86_64/tst-avx512.c
@@ -0,0 +1,57 @@
+/* Test case for preserved AVX512 registers in dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <cpuid.h>
+
+int tst_avx512_aux (void);
+
+static int
+avx512_enabled (void)
+{
+#ifdef bit_AVX512F
+  unsigned int eax, ebx, ecx, edx;
+
+  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0
+      || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE))
+    return 0;
+
+  __cpuid_count (7, 0, eax, ebx, ecx, edx);
+  if (!(ebx & bit_AVX512F))
+    return 0;
+
+  asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
+
+  /* Verify that ZMM, YMM and XMM states are enabled.  */
+  return (eax & 0xe6) == 0xe6;
+#else
+  return 0;
+#endif
+}
+
+static int
+do_test (void)
+{
+  /* Run AVX512 test only if AVX512 is supported.  */
+  if (avx512_enabled ())
+    return tst_avx512_aux ();
+  else
+    return 77;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/sysdeps/x86_64/tst-avx512mod.c b/sysdeps/x86_64/tst-avx512mod.c
new file mode 100644
index 0000000000..4cfb3a2c3d
--- /dev/null
+++ b/sysdeps/x86_64/tst-avx512mod.c
@@ -0,0 +1,48 @@
+/* Test case for x86-64 preserved AVX512 registers in dynamic linker.  */
+
+#ifdef __AVX512F__
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+__m512i
+avx512_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3,
+	     __m512i x4, __m512i x5, __m512i x6, __m512i x7)
+{
+  __m512i zmm;
+
+  zmm = _mm512_set1_epi32 (0);
+  if (memcmp (&zmm, &x0, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (1);
+  if (memcmp (&zmm, &x1, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (2);
+  if (memcmp (&zmm, &x2, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (3);
+  if (memcmp (&zmm, &x3, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (4);
+  if (memcmp (&zmm, &x4, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (5);
+  if (memcmp (&zmm, &x5, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (6);
+  if (memcmp (&zmm, &x6, sizeof (zmm)))
+    abort ();
+
+  zmm = _mm512_set1_epi32 (7);
+  if (memcmp (&zmm, &x7, sizeof (zmm)))
+    abort ();
+
+  return _mm512_set1_epi32 (0x12349876);
+}
+#endif
diff --git a/sysdeps/x86_64/tst-avxmod.c b/sysdeps/x86_64/tst-avxmod.c
new file mode 100644
index 0000000000..6e5b154997
--- /dev/null
+++ b/sysdeps/x86_64/tst-avxmod.c
@@ -0,0 +1,48 @@
+/* Test case for x86-64 preserved AVX registers in dynamic linker.  */
+
+#ifdef __AVX__
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+__m256i
+avx_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3,
+	  __m256i x4, __m256i x5, __m256i x6, __m256i x7)
+{
+  __m256i ymm;
+
+  ymm = _mm256_set1_epi32 (0);
+  if (memcmp (&ymm, &x0, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (1);
+  if (memcmp (&ymm, &x1, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (2);
+  if (memcmp (&ymm, &x2, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (3);
+  if (memcmp (&ymm, &x3, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (4);
+  if (memcmp (&ymm, &x4, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (5);
+  if (memcmp (&ymm, &x5, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (6);
+  if (memcmp (&ymm, &x6, sizeof (ymm)))
+    abort ();
+
+  ymm = _mm256_set1_epi32 (7);
+  if (memcmp (&ymm, &x7, sizeof (ymm)))
+    abort ();
+
+  return _mm256_set1_epi32 (0x12349876);
+}
+#endif
diff --git a/sysdeps/x86_64/tst-sse.c b/sysdeps/x86_64/tst-sse.c
new file mode 100644
index 0000000000..dd1537cf27
--- /dev/null
+++ b/sysdeps/x86_64/tst-sse.c
@@ -0,0 +1,46 @@
+/* Test case for preserved SSE registers in dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern __m128i sse_test (__m128i, __m128i, __m128i, __m128i,
+			 __m128i, __m128i, __m128i, __m128i);
+
+static int
+do_test (void)
+{
+  __m128i xmm0 = _mm_set1_epi32 (0);
+  __m128i xmm1 = _mm_set1_epi32 (1);
+  __m128i xmm2 = _mm_set1_epi32 (2);
+  __m128i xmm3 = _mm_set1_epi32 (3);
+  __m128i xmm4 = _mm_set1_epi32 (4);
+  __m128i xmm5 = _mm_set1_epi32 (5);
+  __m128i xmm6 = _mm_set1_epi32 (6);
+  __m128i xmm7 = _mm_set1_epi32 (7);
+  __m128i ret = sse_test (xmm0, xmm1, xmm2, xmm3,
+			  xmm4, xmm5, xmm6, xmm7);
+  xmm0 =  _mm_set1_epi32 (0x12349876);
+  if (memcmp (&xmm0, &ret, sizeof (ret)))
+    abort ();
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/sysdeps/x86_64/tst-ssemod.c b/sysdeps/x86_64/tst-ssemod.c
new file mode 100644
index 0000000000..907a64c69e
--- /dev/null
+++ b/sysdeps/x86_64/tst-ssemod.c
@@ -0,0 +1,46 @@
+/* Test case for x86-64 preserved SSE registers in dynamic linker.  */
+
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+__m128i
+sse_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3,
+	  __m128i x4, __m128i x5, __m128i x6, __m128i x7)
+{
+  __m128i xmm;
+
+  xmm = _mm_set1_epi32 (0);
+  if (memcmp (&xmm, &x0, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (1);
+  if (memcmp (&xmm, &x1, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (2);
+  if (memcmp (&xmm, &x2, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (3);
+  if (memcmp (&xmm, &x3, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (4);
+  if (memcmp (&xmm, &x4, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (5);
+  if (memcmp (&xmm, &x5, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (6);
+  if (memcmp (&xmm, &x6, sizeof (xmm)))
+    abort ();
+
+  xmm = _mm_set1_epi32 (7);
+  if (memcmp (&xmm, &x7, sizeof (xmm)))
+    abort ();
+
+  return _mm_set1_epi32 (0x12349876);
+}