author     H.J. Lu <hjl.tools@gmail.com>    2021-03-05 06:46:08 -0800
committer  H.J. Lu <hjl.tools@gmail.com>    2022-01-27 12:47:19 -0800
commit     5141ddbe3aace5c713d6e2c4260cd1646e54489b (patch)
tree       704d4bbbd732c08513535ac6364ec27d8edbf39a
parent     a3a8109999f86bd7f968aa99c3fa7e2d3a7c6de8 (diff)
x86-64: Add memmove family functions with 256-bit EVEX

Update ifunc-memmove.h to select the functions optimized with 256-bit EVEX
instructions using the YMM16-YMM31 registers. When AVX512VL is usable this
avoids RTM aborts, since VZEROUPPER isn't needed at function exit.

(cherry picked from commit 63ad43566f7a25d140dc723598aeb441ad657eed)
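For context on the commit message: only writes to the upper halves of YMM0-YMM15 leave the legacy SSE state "dirty", which is what forces a VZEROUPPER before returning, and VZEROUPPER aborts any in-flight RTM transaction. EVEX-encoded moves that stay within YMM16-YMM31 never dirty that state, so the function can return without VZEROUPPER. The following is a minimal stand-alone sketch, not part of the patch; it assumes GCC on x86-64, built with something like "gcc -mavx512vl -c", and the function names copy32_vex/copy32_evex are made up for illustration.

/* VEX-encoded 32-byte copy through ymm0: dirties the upper halves of
   the legacy SSE registers, so VZEROUPPER is required before returning,
   and VZEROUPPER aborts an in-flight RTM transaction.  */
void
copy32_vex (void *dst, const void *src)
{
  __asm__ volatile ("vmovdqu (%1), %%ymm0\n\t"
		    "vmovdqu %%ymm0, (%0)\n\t"
		    "vzeroupper"
		    : : "r" (dst), "r" (src) : "xmm0", "memory");
}

/* EVEX-encoded 32-byte copy through ymm16 (needs AVX512VL): xmm0-xmm15
   are never touched, so no VZEROUPPER is needed at function exit.  */
void
copy32_evex (void *dst, const void *src)
{
  __asm__ volatile ("vmovdqu64 (%1), %%ymm16\n\t"
		    "vmovdqu64 %%ymm16, (%0)"
		    : : "r" (dst), "r" (src) : "xmm16", "memory");
}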
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile                        |  1
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c               | 36
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-memmove.h                 | 21
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S   | 26
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S    | 24
5 files changed, 97 insertions(+), 11 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 38349a690b..e75805a645 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -42,6 +42,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
memchr-evex \
+ memmove-evex-unaligned-erms \
memrchr-evex \
rawmemchr-evex \
stpcpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 680c2c070f..d7814a965f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -81,6 +81,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memmove_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memmove_chk_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
@@ -103,6 +109,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memmove_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memmove_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memmove,
@@ -554,6 +566,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memcpy_chk_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
@@ -575,6 +593,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -612,6 +636,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __mempcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __mempcpy_chk_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
@@ -642,6 +672,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __mempcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 5b1eb1c92c..83db955826 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
- return OPTIMIZE (avx_unaligned_erms);
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx_unaligned_erms);
- return OPTIMIZE (avx_unaligned);
+ return OPTIMIZE (avx_unaligned);
+ }
}
if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3)
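Taken together, the selector above now prefers the EVEX variants whenever AVX512VL is usable, uses the VEX/AVX variants only when VZEROUPPER is acceptable (Prefer_No_VZEROUPPER is not set), picks the _erms flavor when ERMS is available, and otherwise falls through to the SSSE3/SSE2 logic below. The snippet that follows is only a rough user-space analogue for experimentation, not glibc code: it relies on GCC's __builtin_cpu_supports rather than glibc's internal cpu_features bits, and it collapses the ERMS, AVX_Fast_Unaligned_Load and Prefer_No_VZEROUPPER distinctions into the parenthesized suffixes.

#include <stdio.h>

int
main (void)
{
  const char *variant;

  if (__builtin_cpu_supports ("avx512vl"))
    variant = "__memmove_evex_unaligned(_erms)";  /* ymm16-ymm31, no VZEROUPPER */
  else if (__builtin_cpu_supports ("avx2"))
    variant = "__memmove_avx_unaligned(_erms)";   /* ymm0-ymm15, VZEROUPPER at exit */
  else if (__builtin_cpu_supports ("ssse3"))
    variant = "__memmove_ssse3(_back)";
  else
    variant = "__memmove_sse2_unaligned(_erms)";

  printf ("memmove variant this CPU would roughly select: %s\n", variant);
  return 0;
}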
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
new file mode 100644
index 0000000000..b879007e89
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,26 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 ymm16
+# define VEC1 ymm17
+# define VEC2 ymm18
+# define VEC3 ymm19
+# define VEC4 ymm20
+# define VEC5 ymm21
+# define VEC6 ymm22
+# define VEC7 ymm23
+# define VEC8 ymm24
+# define VEC(i) VEC##i
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER
+
+# define SECTION(p) p##.evex
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 5aaadc233f..50fffeb5ce 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -48,6 +48,14 @@
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -312,20 +320,20 @@ L(less_vec):
#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
- vmovdqu (%rsi), %ymm0
- vmovdqu -32(%rsi,%rdx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -32(%rdi,%rdx)
+ VMOVU (%rsi), %YMM0
+ VMOVU -32(%rsi,%rdx), %YMM1
+ VMOVU %YMM0, (%rdi)
+ VMOVU %YMM1, -32(%rdi,%rdx)
VZEROUPPER
ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu (%rsi), %xmm0
- vmovdqu -16(%rsi,%rdx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -16(%rdi,%rdx)
+ VMOVU (%rsi), %XMM0
+ VMOVU -16(%rsi,%rdx), %XMM1
+ VMOVU %XMM0, (%rdi)
+ VMOVU %XMM1, -16(%rdi,%rdx)
ret
#endif
L(between_8_15):