aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author H.J. Lu <hjl.tools@gmail.com> 2021-03-05 07:26:42 -0800
committer H.J. Lu <hjl.tools@gmail.com> 2022-01-27 12:47:19 -0800
commit 5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5 (patch)
tree 49bd6c60949e027f9f1b38436f7fb4ba0e569b73
parent d584356fe8417790d20f024c5233c2d35a615ea2 (diff)
downloadglibc-5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5.tar
glibc-5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5.tar.gz
glibc-5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5.tar.bz2
glibc-5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5.zip
x86-64: Add AVX optimized string/memory functions for RTM
Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX optimized string/memory functions with

	xtest
	jz	1f
	vzeroall
	ret
1:
	vzeroupper
	ret

at function exit on processors with usable RTM, but without 256-bit EVEX instructions to avoid VZEROUPPER inside a transactionally executing RTM region.

(cherry picked from commit 7ebba91361badf7531d4e75050627a88d424872f)
-rw-r--r--sysdeps/x86_64/multiarch/Makefile21
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-avx2.h4
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-impl-list.c146
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-memcmp.h4
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-memmove.h12
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-memset.h12
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-wmemset.h5
-rw-r--r--sysdeps/x86_64/multiarch/memchr-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/memchr-avx2.S45
-rw-r--r--sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S28
-rw-r--r--sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S17
-rw-r--r--sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S33
-rw-r--r--sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/memrchr-avx2.S53
-rw-r--r--sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S10
-rw-r--r--sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S12
-rw-r--r--sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S41
-rw-r--r--sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/strcat-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strchr-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strchr-avx2.S22
-rw-r--r--sysdeps/x86_64/multiarch/strchr.c4
-rw-r--r--sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-avx2.S55
-rw-r--r--sysdeps/x86_64/multiarch/strcmp.c4
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strlen-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strlen-avx2.S43
-rw-r--r--sysdeps/x86_64/multiarch/strncat-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/strncmp.c4
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S12
-rw-r--r--sysdeps/x86_64/multiarch/strrchr-avx2.S19
-rw-r--r--sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S5
-rw-r--r--sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S5
-rw-r--r--sysdeps/x86_64/multiarch/wcsnlen.c4
-rw-r--r--sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S3
-rw-r--r--sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S4
-rw-r--r--sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S4
-rw-r--r--sysdeps/x86_64/sysdep.h22
48 files changed, 594 insertions, 190 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 141585a984..49fe1aaef7 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,19 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-sse2-unaligned-erms \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
+ memchr-avx2-rtm \
+ memcmp-avx2-movbe-rtm \
+ memmove-avx-unaligned-erms-rtm \
+ memrchr-avx2-rtm \
+ memset-avx2-unaligned-erms-rtm \
+ rawmemchr-avx2-rtm \
+ strchr-avx2-rtm \
+ strcmp-avx2-rtm \
+ strchrnul-avx2-rtm \
+ strlen-avx2-rtm \
+ strncmp-avx2-rtm \
+ strnlen-avx2-rtm \
+ strrchr-avx2-rtm \
memchr-evex \
memcmp-evex-movbe \
memmove-evex-unaligned-erms \
@@ -77,6 +90,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcsrchr-sse2 wcsrchr-avx2 \
wcsnlen-sse4_1 wcsnlen-c \
wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+ wcschr-avx2-rtm \
+ wcscmp-avx2-rtm \
+ wcslen-avx2-rtm \
+ wcsncmp-avx2-rtm \
+ wcsnlen-avx2-rtm \
+ wcsrchr-avx2-rtm \
+ wmemchr-avx2-rtm \
+ wmemcmp-avx2-movbe-rtm \
wcschr-evex \
wcscmp-evex \
wcslen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index d7875db6e2..348d3d0531 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -21,6 +21,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURES_CPU_P (cpu_features, BMI2))
return OPTIMIZE (evex);
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 13b0599bcc..8bc42e7813 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -44,6 +44,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__memchr_avx2)
IFUNC_IMPL_ADD (array, i, memchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, memchr,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -57,6 +61,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& HAS_CPU_FEATURE (MOVBE)),
__memcmp_avx2_movbe)
IFUNC_IMPL_ADD (array, i, memcmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (MOVBE)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcmp_avx2_movbe_rtm)
+ IFUNC_IMPL_ADD (array, i, memcmp,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (MOVBE)),
@@ -86,6 +95,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memmove_chk_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memmove_chk_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX512VL_Usable),
__memmove_chk_evex_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
@@ -114,6 +131,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memmove_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memmove_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512VL_Usable),
__memmove_evex_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
@@ -144,6 +169,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__memrchr_avx2)
IFUNC_IMPL_ADD (array, i, memrchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memrchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, memrchr,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
__memrchr_evex)
@@ -166,6 +195,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memset_chk_avx2_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memset_chk_avx2_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
__memset_chk_evex_unaligned)
@@ -199,6 +236,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memset_avx2_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memset_avx2_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, memset,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
__memset_evex_unaligned)
@@ -223,6 +268,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__rawmemchr_avx2)
IFUNC_IMPL_ADD (array, i, rawmemchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __rawmemchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -235,6 +284,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__strlen_avx2)
IFUNC_IMPL_ADD (array, i, strlen,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strlen_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strlen,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
__strlen_evex)
@@ -246,6 +299,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__strnlen_avx2)
IFUNC_IMPL_ADD (array, i, strnlen,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strnlen_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strnlen,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
__strnlen_evex)
@@ -318,6 +375,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__strchr_avx2)
IFUNC_IMPL_ADD (array, i, strchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strchr,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -331,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__strchrnul_avx2)
IFUNC_IMPL_ADD (array, i, strchrnul,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strchrnul_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strchrnul,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -343,6 +408,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__strrchr_avx2)
IFUNC_IMPL_ADD (array, i, strrchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strrchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strrchr,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
__strrchr_evex)
@@ -354,6 +423,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__strcmp_avx2)
IFUNC_IMPL_ADD (array, i, strcmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strcmp_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strcmp,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -458,6 +531,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcschr_avx2)
IFUNC_IMPL_ADD (array, i, wcschr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcschr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcschr,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -470,6 +547,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcsrchr_avx2)
IFUNC_IMPL_ADD (array, i, wcsrchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcsrchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -482,6 +563,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcscmp_avx2)
IFUNC_IMPL_ADD (array, i, wcscmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcscmp_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcscmp,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -494,6 +579,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcsncmp_avx2)
IFUNC_IMPL_ADD (array, i, wcsncmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcsncmp_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcsncmp,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -512,6 +601,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcslen_avx2)
IFUNC_IMPL_ADD (array, i, wcslen,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcslen_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcslen,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -524,6 +617,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcsnlen_avx2)
IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcsnlen_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -539,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wmemchr_avx2)
IFUNC_IMPL_ADD (array, i, wmemchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wmemchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wmemchr,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (BMI2)),
@@ -552,6 +653,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& HAS_CPU_FEATURE (MOVBE)),
__wmemcmp_avx2_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (MOVBE)
+ && HAS_CPU_FEATURE (RTM)),
+ __wmemcmp_avx2_movbe_rtm)
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
&& HAS_CPU_FEATURE (MOVBE)),
@@ -570,6 +676,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wmemset_avx2_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wmemset_avx2_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, wmemset,
HAS_ARCH_FEATURE (AVX512VL_Usable),
__wmemset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
@@ -595,6 +705,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcpy_chk_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcpy_chk_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX512VL_Usable),
__memcpy_chk_evex_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
@@ -623,6 +741,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcpy_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcpy_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX512VL_Usable),
__memcpy_evex_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy,
@@ -665,6 +791,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __mempcpy_chk_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __mempcpy_chk_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX512VL_Usable),
__mempcpy_chk_evex_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
@@ -702,6 +836,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __mempcpy_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __mempcpy_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX512VL_Usable),
__mempcpy_evex_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy,
@@ -723,6 +865,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__strncmp_avx2)
IFUNC_IMPL_ADD (array, i, strncmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strncmp_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strncmp,
(HAS_ARCH_FEATURE (AVX512VL_Usable)
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
__strncmp_evex)
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 34e0a1295f..c12a023a19 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
static inline void *
@@ -38,6 +39,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
return OPTIMIZE (evex_movbe);
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_movbe_rtm);
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2_movbe);
}
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 83db955826..fe003b28e1 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (evex_unaligned);
}
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx_unaligned_erms_rtm);
+
+ return OPTIMIZE (avx_unaligned_rtm);
+ }
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
{
if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index fea6c832f4..6fdf53ec1c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
@@ -69,6 +73,14 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (evex_unaligned);
}
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx2_unaligned_erms_rtm);
+
+ return OPTIMIZE (avx2_unaligned_rtm);
+ }
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
{
if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index fae721cdb0..091d691dc6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,8 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
@@ -39,6 +41,9 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
return OPTIMIZE (evex_unaligned);
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_unaligned_rtm);
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2_unaligned);
}
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
new file mode 100644
index 0000000000..87b076c7c4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index c81da19bf0..cf893e77b3 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -34,9 +34,13 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
@@ -107,8 +111,8 @@ L(cros_page_boundary):
# endif
addq %rdi, %rax
addq %rcx, %rax
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(aligned_more):
@@ -224,8 +228,7 @@ L(last_4x_vec_or_less):
jnz L(first_vec_x3_check)
xorl %eax, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_2x_vec):
@@ -243,8 +246,7 @@ L(last_2x_vec):
testl %eax, %eax
jnz L(first_vec_x1_check)
xorl %eax, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x0_check):
@@ -253,8 +255,7 @@ L(first_vec_x0_check):
cmpq %rax, %rdx
jbe L(zero)
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1_check):
@@ -264,8 +265,7 @@ L(first_vec_x1_check):
jbe L(zero)
addq $VEC_SIZE, %rax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2_check):
@@ -275,8 +275,7 @@ L(first_vec_x2_check):
jbe L(zero)
addq $(VEC_SIZE * 2), %rax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x3_check):
@@ -286,12 +285,14 @@ L(first_vec_x3_check):
jbe L(zero)
addq $(VEC_SIZE * 3), %rax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(zero):
- VZEROUPPER
+ xorl %eax, %eax
+ jmp L(return_vzeroupper)
+
+ .p2align 4
L(null):
xorl %eax, %eax
ret
@@ -301,24 +302,21 @@ L(null):
L(first_vec_x0):
tzcntl %eax, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
addq $VEC_SIZE, %rax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
addq $(VEC_SIZE * 2), %rax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@@ -337,8 +335,7 @@ L(first_vec_x3):
tzcntl %eax, %eax
addq $(VEC_SIZE * 3), %rax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (MEMCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
new file mode 100644
index 0000000000..cf4eff5d4a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCMP
+# define MEMCMP __memcmp_avx2_movbe_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memcmp-avx2-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index e3a35b899e..9d5c9c72b3 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -47,6 +47,10 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
# define VEC_MASK ((1 << VEC_SIZE) - 1)
@@ -55,7 +59,7 @@
memcmp has to use UNSIGNED comparison for elemnts.
*/
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
shl $2, %RDX_LP
@@ -123,8 +127,8 @@ ENTRY (MEMCMP)
vptest %ymm0, %ymm5
jnc L(4x_vec_end)
xorl %eax, %eax
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(last_2x_vec):
@@ -144,8 +148,7 @@ L(last_vec):
vpmovmskb %ymm2, %eax
subl $VEC_MASK, %eax
jnz L(first_vec)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec):
@@ -164,8 +167,7 @@ L(wmemcmp_return):
movzbl (%rsi, %rcx), %edx
sub %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# ifdef USE_AS_WMEMCMP
.p2align 4
@@ -367,8 +369,7 @@ L(last_4x_vec):
vpmovmskb %ymm2, %eax
subl $VEC_MASK, %eax
jnz L(first_vec)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@@ -394,8 +395,7 @@ L(4x_vec_end):
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
sub %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
@@ -410,8 +410,7 @@ L(first_vec_x1):
movzbl VEC_SIZE(%rsi, %rcx), %edx
sub %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
@@ -426,7 +425,6 @@ L(first_vec_x2):
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
sub %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (MEMCMP)
#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
new file mode 100644
index 0000000000..1ec1962e86
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -0,0 +1,17 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32
+# define VEC(i) ymm##i
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu
+# define VMOVA vmovdqa
+
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+# define VZEROUPPER_RETURN jmp L(return)
+
+# define SECTION(p) p##.avx.rtm
+# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 50fffeb5ce..386624b3c4 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -158,11 +158,12 @@ L(last_2x_vec):
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
- VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
-#endif
ret
+#else
+ VZEROUPPER_RETURN
+#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
@@ -255,8 +256,11 @@ L(last_2x_vec):
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
- VZEROUPPER
+#if VEC_SIZE > 16
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
ret
+#endif
L(movsb):
cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
@@ -324,8 +328,7 @@ L(between_32_63):
VMOVU -32(%rsi,%rdx), %YMM1
VMOVU %YMM0, (%rdi)
VMOVU %YMM1, -32(%rdi,%rdx)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
@@ -334,7 +337,7 @@ L(between_16_31):
VMOVU -16(%rsi,%rdx), %XMM1
VMOVU %XMM0, (%rdi)
VMOVU %XMM1, -16(%rdi,%rdx)
- ret
+ VZEROUPPER_RETURN
#endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
@@ -387,8 +390,7 @@ L(more_2x_vec):
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(last_4x_vec):
/* Copy from 2 * VEC to 4 * VEC. */
VMOVU (%rsi), %VEC(0)
@@ -399,8 +401,7 @@ L(last_4x_vec):
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(more_8x_vec):
cmpq %rsi, %rdi
@@ -456,8 +457,7 @@ L(loop_4x_vec_forward):
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
@@ -508,8 +508,7 @@ L(loop_4x_vec_backward):
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
@@ -544,8 +543,7 @@ L(loop_large_forward):
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(large_backward):
/* Don't use non-temporal store if there is overlap between
@@ -579,8 +577,7 @@ L(loop_large_backward):
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
new file mode 100644
index 0000000000..cea2d2a72d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ce488dd9e8..20efe7ac7c 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -20,14 +20,22 @@
# include <sysdep.h>
+# ifndef MEMRCHR
+# define MEMRCHR __memrchr_avx2
+# endif
+
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
-ENTRY (__memrchr_avx2)
+ .section SECTION(.text),"ax",@progbits
+ENTRY (MEMRCHR)
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpbroadcastb %xmm0, %ymm0
@@ -134,8 +142,8 @@ L(loop_4x_vec):
vpmovmskb %ymm1, %eax
bsrl %eax, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(last_4x_vec_or_less):
@@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
addq %rax, %rdx
jl L(zero)
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_2x_vec):
@@ -191,31 +198,27 @@ L(last_2x_vec):
jl L(zero)
addl $(VEC_SIZE * 2), %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x0):
bsrl %eax, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x1):
bsrl %eax, %eax
addl $VEC_SIZE, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x2):
bsrl %eax, %eax
addl $(VEC_SIZE * 2), %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x3):
@@ -232,8 +235,7 @@ L(last_vec_x1_check):
jl L(zero)
addl $VEC_SIZE, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x3_check):
@@ -243,12 +245,14 @@ L(last_vec_x3_check):
jl L(zero)
addl $(VEC_SIZE * 3), %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(zero):
- VZEROUPPER
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+
+ .p2align 4
L(null):
xorl %eax, %eax
ret
@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
bsrl %eax, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_or_less):
@@ -315,8 +318,7 @@ L(last_vec_or_less):
bsrl %eax, %eax
addq %rdi, %rax
addq %r8, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_2x_aligned):
@@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
bsrl %eax, %eax
addq %rdi, %rax
addq %r8, %rax
- VZEROUPPER
- ret
-END (__memrchr_avx2)
+ VZEROUPPER_RETURN
+END (MEMRCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
new file mode 100644
index 0000000000..8ac3e479bb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -0,0 +1,10 @@
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return)
+
+#define SECTION(p) p##.avx.rtm
+#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
+#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
+
+#include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 7ab3d89849..ae0860f36a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,9 +14,15 @@
movq r, %rax; \
vpbroadcastd %xmm0, %ymm0
-# define SECTION(p) p##.avx
-# define MEMSET_SYMBOL(p,s) p##_avx2_##s
-# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+# ifndef MEMSET_SYMBOL
+# define MEMSET_SYMBOL(p,s) p##_avx2_##s
+# endif
+# ifndef WMEMSET_SYMBOL
+# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
+# endif
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9f14e956d1..7747bc5d8b 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -45,17 +45,14 @@
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
+# define VZEROUPPER_SHORT_RETURN vzeroupper; ret
# else
# define VZEROUPPER
# endif
#endif
#ifndef VZEROUPPER_SHORT_RETURN
-# if VEC_SIZE > 16
-# define VZEROUPPER_SHORT_RETURN vzeroupper
-# else
-# define VZEROUPPER_SHORT_RETURN rep
-# endif
+# define VZEROUPPER_SHORT_RETURN rep; ret
#endif
#ifndef MOVQ
@@ -127,8 +124,7 @@ L(entry_from_bzero):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))
@@ -151,14 +147,12 @@ ENTRY (__memset_erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
- /* Issue vzeroupper before rep stosb. */
- VZEROUPPER
mov %RDX_LP, %RCX_LP
movzbl %sil, %eax
mov %RDI_LP, %RDX_LP
rep stosb
mov %RDX_LP, %RAX_LP
- ret
+ VZEROUPPER_RETURN
# if VEC_SIZE == 16
END (__memset_erms)
# else
@@ -185,8 +179,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(stosb_more_2x_vec):
cmpq $REP_STOSB_THRESHOLD, %rdx
@@ -200,8 +193,11 @@ L(more_2x_vec):
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
- VZEROUPPER
+#if VEC_SIZE > 16
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
ret
+#endif
L(loop_start):
leaq (VEC_SIZE * 4)(%rdi), %rcx
@@ -227,7 +223,6 @@ L(loop):
cmpq %rcx, %rdx
jne L(loop)
VZEROUPPER_SHORT_RETURN
- ret
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -251,40 +246,34 @@ L(less_vec):
jb 1f
movb %cl, (%rdi)
1:
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# if VEC_SIZE > 32
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
VMOVU %YMM0, -32(%rdi,%rdx)
VMOVU %YMM0, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# endif
# if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
VMOVU %XMM0, -16(%rdi,%rdx)
VMOVU %XMM0, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# endif
/* From 8 to 15. No branch when size == 8. */
L(between_8_15):
movq %rcx, -8(%rdi,%rdx)
movq %rcx, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl %ecx, -4(%rdi,%rdx)
movl %ecx, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
movw %cx, -2(%rdi,%rdx)
movw %cx, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
new file mode 100644
index 0000000000..acc5f6e2fb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2_rtm
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
new file mode 100644
index 0000000000..60a2ccfe53
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
new file mode 100644
index 0000000000..637fb557c4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCAT
+# define STRCAT __strcat_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
new file mode 100644
index 0000000000..81f20d1d8e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCHR
+# define STRCHR __strchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 47bc3c9949..da7d262065 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -38,9 +38,13 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
/* Broadcast CHAR to YMM0. */
@@ -93,8 +97,8 @@ L(cros_page_boundary):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(aligned_more):
@@ -190,8 +194,7 @@ L(first_vec_x0):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
@@ -205,8 +208,7 @@ L(first_vec_x1):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
@@ -220,8 +222,7 @@ L(first_vec_x2):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@@ -247,8 +248,7 @@ L(first_vec_x3):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (STRCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index efe6584076..9bae2099d9 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -29,6 +29,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURES_CPU_P (cpu_features, BMI2))
return OPTIMIZE (evex);
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
new file mode 100644
index 0000000000..cdcf818b91
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2_rtm
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
new file mode 100644
index 0000000000..aecd30d97f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCMP
+# define STRCMP __strcmp_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 8fb8eedcde..5d1c9d9018 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -55,6 +55,10 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -75,7 +79,7 @@
the maximum offset is reached before a difference is found, zero is
returned. */
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
/* Check for simple cases (0 or 1) in offset. */
@@ -137,8 +141,8 @@ L(return):
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
# endif
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(return_vec_size):
@@ -171,8 +175,7 @@ L(return_vec_size):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(return_2_vec_size):
@@ -205,8 +208,7 @@ L(return_2_vec_size):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(return_3_vec_size):
@@ -239,8 +241,7 @@ L(return_3_vec_size):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(next_3_vectors):
@@ -366,8 +367,7 @@ L(back_to_loop):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(test_vec):
@@ -410,8 +410,7 @@ L(test_vec):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(test_2_vec):
@@ -454,8 +453,7 @@ L(test_2_vec):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(test_3_vec):
@@ -496,8 +494,7 @@ L(test_3_vec):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(loop_cross_page):
@@ -566,8 +563,7 @@ L(loop_cross_page):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(loop_cross_page_2_vec):
@@ -641,8 +637,7 @@ L(loop_cross_page_2_vec):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
@@ -684,8 +679,7 @@ L(cross_page_loop):
# ifndef USE_AS_WCSCMP
L(different):
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# ifdef USE_AS_WCSCMP
.p2align 4
@@ -695,16 +689,14 @@ L(different):
setl %al
negl %eax
orl $1, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# endif
# ifdef USE_AS_STRNCMP
.p2align 4
L(zero):
xorl %eax, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(char0):
@@ -718,8 +710,7 @@ L(char0):
movzbl (%rdi), %eax
subl %ecx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# endif
.p2align 4
@@ -744,8 +735,7 @@ L(last_vector):
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
/* Comparing on page boundary region requires special treatment:
It must done one vector at the time, starting with the wider
@@ -866,7 +856,6 @@ L(cross_page_4bytes):
testl %eax, %eax
jne L(cross_page_loop)
subl %ecx, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (STRCMP)
#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index e947cefb08..c99c08aa3f 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
return OPTIMIZE (evex);
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
new file mode 100644
index 0000000000..c2c581ecf7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCPY
+# define STRCPY __strcpy_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
new file mode 100644
index 0000000000..75b4b7612c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRLEN
+# define STRLEN __strlen_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 645e04461f..82826e1098 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -36,9 +36,13 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check for zero length. */
@@ -111,8 +115,8 @@ L(cros_page_boundary):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(aligned_more):
@@ -231,8 +235,7 @@ L(last_4x_vec_or_less):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_2x_vec):
@@ -253,8 +256,7 @@ L(last_2x_vec):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x0_check):
@@ -267,8 +269,7 @@ L(first_vec_x0_check):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1_check):
@@ -282,8 +283,7 @@ L(first_vec_x1_check):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2_check):
@@ -297,8 +297,7 @@ L(first_vec_x2_check):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x3_check):
@@ -312,8 +311,7 @@ L(first_vec_x3_check):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(max):
@@ -321,8 +319,7 @@ L(max):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(zero):
@@ -338,8 +335,7 @@ L(first_vec_x0):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
@@ -350,8 +346,7 @@ L(first_vec_x1):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
@@ -362,8 +357,7 @@ L(first_vec_x2):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@@ -389,8 +383,7 @@ L(first_vec_x3):
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (STRLEN)
#endif
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
new file mode 100644
index 0000000000..0dcea18dbb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2_rtm
+#include "strcat-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
new file mode 100644
index 0000000000..37d1224bb9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCMP __strncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index 4069946f80..880e39659f 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
return OPTIMIZE (evex);
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
new file mode 100644
index 0000000000..79e7083299
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
new file mode 100644
index 0000000000..04f1626a5c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2_rtm
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
new file mode 100644
index 0000000000..5def14ec1c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRRCHR
+# define STRRCHR __strrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 4381e6ab3e..9f22a15e25 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -36,9 +36,13 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRRCHR)
movd %esi, %xmm4
movl %edi, %ecx
@@ -166,8 +170,8 @@ L(return_value):
# endif
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(match):
@@ -198,8 +202,7 @@ L(find_nul):
jz L(return_value)
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(char_and_nul):
@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec):
jz L(return_null)
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(return_null):
xorl %eax, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (STRRCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
new file mode 100644
index 0000000000..d49dbbf0b4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2_rtm
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
new file mode 100644
index 0000000000..d6ca2b8064
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRCMP __wcscmp_avx2_rtm
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
new file mode 100644
index 0000000000..35658d7365
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2_rtm
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
new file mode 100644
index 0000000000..4e88c70cc6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
@@ -0,0 +1,5 @@
+#define STRCMP __wcsncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
new file mode 100644
index 0000000000..7437ebee2d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2_rtm
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index f0bcfd180d..95f3cfb2cf 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -29,6 +29,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURES_CPU_P (cpu_features, BMI2))
return OPTIMIZE (evex);
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2);
}
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
new file mode 100644
index 0000000000..9bf760833f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2_rtm
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
new file mode 100644
index 0000000000..58ed21db01
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2_rtm
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
new file mode 100644
index 0000000000..31104d1215
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2_movbe_rtm
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2-movbe-rtm.S"
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 1738d7f955..223f1a5949 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -95,6 +95,28 @@ lose: \
#define R14_LP r14
#define R15_LP r15
+/* Zero upper vector registers and return with xtest. NB: Use VZEROALL
+ to avoid the RTM abort triggered by VZEROUPPER inside a transaction. */
+#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
+ xtest; \
+ jz 1f; \
+ vzeroall; \
+ ret; \
+1: \
+ vzeroupper; \
+ ret
+
+/* Zero upper vector registers and return. */
+#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ VZEROUPPER; \
+ ret
+#endif
+
+#ifndef VZEROUPPER_RETURN
+# define VZEROUPPER_RETURN VZEROUPPER; ret
+#endif
+
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */