From 5f59aaddc9be84a6ce8c71cab90ee08dbbd72af5 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 5 Mar 2021 07:26:42 -0800 Subject: x86-64: Add AVX optimized string/memory functions for RTM Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX optimized string/memory functions with xtest jz 1f vzeroall ret 1: vzeroupper ret at function exit on processors with usable RTM, but without 256-bit EVEX instructions to avoid VZEROUPPER inside a transactionally executing RTM region. (cherry picked from commit 7ebba91361badf7531d4e75050627a88d424872f) --- sysdeps/x86_64/multiarch/Makefile | 21 +++ sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 146 +++++++++++++++++++++ sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 ++ sysdeps/x86_64/multiarch/ifunc-memset.h | 12 ++ sysdeps/x86_64/multiarch/ifunc-wmemset.h | 5 + sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memchr-avx2.S | 45 +++---- sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 28 ++-- .../multiarch/memmove-avx-unaligned-erms-rtm.S | 17 +++ .../x86_64/multiarch/memmove-vec-unaligned-erms.S | 33 +++-- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memrchr-avx2.S | 53 ++++---- .../multiarch/memset-avx2-unaligned-erms-rtm.S | 10 ++ .../x86_64/multiarch/memset-avx2-unaligned-erms.S | 12 +- .../x86_64/multiarch/memset-vec-unaligned-erms.S | 41 +++--- sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strchr-avx2.S | 22 ++-- sysdeps/x86_64/multiarch/strchr.c | 4 + sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strcmp-avx2.S | 55 ++++---- sysdeps/x86_64/multiarch/strcmp.c | 4 + sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strlen-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strlen-avx2.S | 43 +++--- sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strncmp.c | 4 + sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strrchr-avx2.S | 19 +-- sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 5 + sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S | 5 + sysdeps/x86_64/multiarch/wcsnlen.c | 4 + sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S | 4 + sysdeps/x86_64/sysdep.h | 22 ++++ 48 files changed, 594 insertions(+), 190 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S create mode 100644 
sysdeps/x86_64/multiarch/strcat-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 141585a984..49fe1aaef7 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -41,6 +41,19 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memset-sse2-unaligned-erms \ memset-avx2-unaligned-erms \ memset-avx512-unaligned-erms \ + memchr-avx2-rtm \ + memcmp-avx2-movbe-rtm \ + memmove-avx-unaligned-erms-rtm \ + memrchr-avx2-rtm \ + memset-avx2-unaligned-erms-rtm \ + rawmemchr-avx2-rtm \ + strchr-avx2-rtm \ + strcmp-avx2-rtm \ + strchrnul-avx2-rtm \ + strlen-avx2-rtm \ + strncmp-avx2-rtm \ + strnlen-avx2-rtm \ + strrchr-avx2-rtm \ memchr-evex \ memcmp-evex-movbe \ memmove-evex-unaligned-erms \ @@ -77,6 +90,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ wcsrchr-sse2 wcsrchr-avx2 \ wcsnlen-sse4_1 wcsnlen-c \ wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + wcslen-avx2-rtm \ + wcsncmp-avx2-rtm \ + wcsnlen-avx2-rtm \ + wcsrchr-avx2-rtm \ + wmemchr-avx2-rtm \ + wmemcmp-avx2-movbe-rtm \ wcschr-evex \ wcscmp-evex \ wcslen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h index d7875db6e2..348d3d0531 100644 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -21,6 +21,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -36,6 +37,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURES_CPU_P (cpu_features, BMI2)) return OPTIMIZE (evex); + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 13b0599bcc..8bc42e7813 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memchr, HAS_ARCH_FEATURE (AVX2_Usable), __memchr_avx2) + IFUNC_IMPL_ADD (array, i, memchr, + (HAS_ARCH_FEATURE 
(AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, memchr, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (HAS_ARCH_FEATURE (AVX2_Usable) && HAS_CPU_FEATURE (MOVBE)), __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE) + && HAS_CPU_FEATURE (RTM)), + __memcmp_avx2_movbe_rtm) IFUNC_IMPL_ADD (array, i, memcmp, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_chk_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_chk_evex_unaligned) @@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memmove_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX512VL_Usable), __memmove_evex_unaligned) @@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memrchr, HAS_ARCH_FEATURE (AVX2_Usable), __memrchr_avx2) + IFUNC_IMPL_ADD (array, i, memrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memrchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, memrchr, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable)), @@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX2_Usable), __memset_chk_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_chk_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_chk_avx2_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __memset_chk, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable)), @@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX2_Usable), __memset_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __memset_avx2_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, memset, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable)), @@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_ARCH_FEATURE (AVX2_Usable), __rawmemchr_avx2) + IFUNC_IMPL_ADD (array, i, rawmemchr, + 
(HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __rawmemchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, rawmemchr, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strlen, HAS_ARCH_FEATURE (AVX2_Usable), __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strlen, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable)), @@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strnlen, HAS_ARCH_FEATURE (AVX2_Usable), __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strnlen, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable)), @@ -317,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchr, HAS_ARCH_FEATURE (AVX2_Usable), __strchr_avx2) + IFUNC_IMPL_ADD (array, i, strchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, strchr, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -330,6 +391,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchrnul, HAS_ARCH_FEATURE (AVX2_Usable), __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strchrnul_avx2_rtm) IFUNC_IMPL_ADD (array, i, strchrnul, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -342,6 +407,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strrchr, HAS_ARCH_FEATURE (AVX2_Usable), __strrchr_avx2) + IFUNC_IMPL_ADD (array, i, strrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strrchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, strrchr, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable)), @@ -353,6 +422,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcmp, HAS_ARCH_FEATURE (AVX2_Usable), __strcmp_avx2) + IFUNC_IMPL_ADD (array, i, strcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strcmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, strcmp, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -457,6 +530,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcschr, HAS_ARCH_FEATURE (AVX2_Usable), __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcschr_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcschr, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -469,6 +546,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_ARCH_FEATURE (AVX2_Usable), __wcsrchr_avx2) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcsrchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcsrchr, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -481,6 +562,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 
IFUNC_IMPL_ADD (array, i, wcscmp, HAS_ARCH_FEATURE (AVX2_Usable), __wcscmp_avx2) + IFUNC_IMPL_ADD (array, i, wcscmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcscmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcscmp, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -493,6 +578,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsncmp, HAS_ARCH_FEATURE (AVX2_Usable), __wcsncmp_avx2) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcsncmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcsncmp, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -511,6 +600,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcslen, HAS_ARCH_FEATURE (AVX2_Usable), __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcslen_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcslen, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -523,6 +616,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsnlen, HAS_ARCH_FEATURE (AVX2_Usable), __wcsnlen_avx2) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wcsnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcsnlen, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -538,6 +635,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wmemchr, HAS_ARCH_FEATURE (AVX2_Usable), __wmemchr_avx2) + IFUNC_IMPL_ADD (array, i, wmemchr, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wmemchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, wmemchr, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -551,6 +652,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (HAS_ARCH_FEATURE (AVX2_Usable) && HAS_CPU_FEATURE (MOVBE)), __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE) + && HAS_CPU_FEATURE (RTM)), + __wmemcmp_avx2_movbe_rtm) IFUNC_IMPL_ADD (array, i, wmemcmp, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable) @@ -569,6 +675,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wmemset, HAS_ARCH_FEATURE (AVX2_Usable), __wmemset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __wmemset_avx2_unaligned_rtm) IFUNC_IMPL_ADD (array, i, wmemset, HAS_ARCH_FEATURE (AVX512VL_Usable), __wmemset_evex_unaligned) @@ -594,6 +704,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_chk_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_chk_evex_unaligned) @@ -622,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX_Usable), 
__memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __memcpy_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX512VL_Usable), __memcpy_evex_unaligned) @@ -664,6 +790,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_chk_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_chk_evex_unaligned) @@ -701,6 +835,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + (HAS_ARCH_FEATURE (AVX_Usable) + && HAS_CPU_FEATURE (RTM)), + __mempcpy_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX512VL_Usable), __mempcpy_evex_unaligned) @@ -722,6 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncmp, HAS_ARCH_FEATURE (AVX2_Usable), __strncmp_avx2) + IFUNC_IMPL_ADD (array, i, strncmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (RTM)), + __strncmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, strncmp, (HAS_ARCH_FEATURE (AVX512VL_Usable) && HAS_ARCH_FEATURE (AVX512BW_Usable)), diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index 34e0a1295f..c12a023a19 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; static inline void * @@ -38,6 +39,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)) return OPTIMIZE (evex_movbe); + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_movbe_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_movbe); } diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 83db955826..fe003b28e1 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) + 
attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) @@ -71,6 +75,14 @@ IFUNC_SELECTOR (void) return OPTIMIZE (evex_unaligned); } + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms_rtm); + + return OPTIMIZE (avx_unaligned_rtm); + } + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) { if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h index fea6c832f4..6fdf53ec1c 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) @@ -69,6 +73,14 @@ IFUNC_SELECTOR (void) return OPTIMIZE (evex_unaligned); } + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); + } + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) { if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h index fae721cdb0..091d691dc6 100644 --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -20,6 +20,8 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; @@ -39,6 +41,9 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)) return OPTIMIZE (evex_unaligned); + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S new file mode 100644 index 0000000000..87b076c7c4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCHR +# define MEMCHR __memchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index c81da19bf0..cf893e77b3 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -34,9 +34,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section 
SECTION(.text),"ax",@progbits ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ @@ -107,8 +111,8 @@ L(cros_page_boundary): # endif addq %rdi, %rax addq %rcx, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -224,8 +228,7 @@ L(last_4x_vec_or_less): jnz L(first_vec_x3_check) xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -243,8 +246,7 @@ L(last_2x_vec): testl %eax, %eax jnz L(first_vec_x1_check) xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x0_check): @@ -253,8 +255,7 @@ L(first_vec_x0_check): cmpq %rax, %rdx jbe L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1_check): @@ -264,8 +265,7 @@ L(first_vec_x1_check): jbe L(zero) addq $VEC_SIZE, %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2_check): @@ -275,8 +275,7 @@ L(first_vec_x2_check): jbe L(zero) addq $(VEC_SIZE * 2), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x3_check): @@ -286,12 +285,14 @@ L(first_vec_x3_check): jbe L(zero) addq $(VEC_SIZE * 3), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + jmp L(return_vzeroupper) + + .p2align 4 L(null): xorl %eax, %eax ret @@ -301,24 +302,21 @@ L(null): L(first_vec_x0): tzcntl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): tzcntl %eax, %eax addq $VEC_SIZE, %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): tzcntl %eax, %eax addq $(VEC_SIZE * 2), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -337,8 +335,7 @@ L(first_vec_x3): tzcntl %eax, %eax addq $(VEC_SIZE * 3), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000000..cf4eff5d4a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCMP +# define MEMCMP __memcmp_avx2_movbe_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memcmp-avx2-movbe.S" diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index e3a35b899e..9d5c9c72b3 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -47,6 +47,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 # define VEC_MASK ((1 << VEC_SIZE) - 1) @@ -55,7 +59,7 @@ memcmp has to use UNSIGNED comparison for elemnts. 
*/ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP shl $2, %RDX_LP @@ -123,8 +127,8 @@ ENTRY (MEMCMP) vptest %ymm0, %ymm5 jnc L(4x_vec_end) xorl %eax, %eax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_2x_vec): @@ -144,8 +148,7 @@ L(last_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec): @@ -164,8 +167,7 @@ L(wmemcmp_return): movzbl (%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WMEMCMP .p2align 4 @@ -367,8 +369,7 @@ L(last_4x_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -394,8 +395,7 @@ L(4x_vec_end): movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -410,8 +410,7 @@ L(first_vec_x1): movzbl VEC_SIZE(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -426,7 +425,6 @@ L(first_vec_x2): movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCMP) #endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S new file mode 100644 index 0000000000..1ec1962e86 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S @@ -0,0 +1,17 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +# define VZEROUPPER_RETURN jmp L(return) + +# define SECTION(p) p##.avx.rtm +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index 50fffeb5ce..386624b3c4 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -158,11 +158,12 @@ L(last_2x_vec): VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) - VZEROUPPER #if !defined USE_MULTIARCH || !IS_IN (libc) L(nop): -#endif ret +#else + VZEROUPPER_RETURN +#endif #if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned)) @@ -255,8 +256,11 @@ L(last_2x_vec): VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(movsb): cmpq __x86_shared_non_temporal_threshold(%rip), %rdx @@ -324,8 +328,7 @@ L(between_32_63): VMOVU -32(%rsi,%rdx), %YMM1 VMOVU %YMM0, (%rdi) VMOVU %YMM1, -32(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif #if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ @@ -334,7 +337,7 @@ L(between_16_31): VMOVU -16(%rsi,%rdx), %XMM1 VMOVU %XMM0, (%rdi) VMOVU %XMM1, -16(%rdi,%rdx) - ret + VZEROUPPER_RETURN #endif L(between_8_15): /* From 8 to 15. No branch when size == 8. */ @@ -387,8 +390,7 @@ L(more_2x_vec): VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(last_4x_vec): /* Copy from 2 * VEC to 4 * VEC. 
*/ VMOVU (%rsi), %VEC(0) @@ -399,8 +401,7 @@ L(last_4x_vec): VMOVU %VEC(1), VEC_SIZE(%rdi) VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec): cmpq %rsi, %rdi @@ -456,8 +457,7 @@ L(loop_4x_vec_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec_backward): /* Load the first 4 * VEC and last VEC to support overlapping @@ -508,8 +508,7 @@ L(loop_4x_vec_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) L(large_forward): @@ -544,8 +543,7 @@ L(loop_large_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(large_backward): /* Don't use non-temporal store if there is overlap between @@ -579,8 +577,7 @@ L(loop_large_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S new file mode 100644 index 0000000000..cea2d2a72d --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ce488dd9e8..20efe7ac7c 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -20,14 +20,22 @@ # include +# ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2 +# endif + # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits -ENTRY (__memrchr_avx2) + .section SECTION(.text),"ax",@progbits +ENTRY (MEMRCHR) /* Broadcast CHAR to YMM0. 
*/ vmovd %esi, %xmm0 vpbroadcastb %xmm0, %ymm0 @@ -134,8 +142,8 @@ L(loop_4x_vec): vpmovmskb %ymm1, %eax bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_4x_vec_or_less): @@ -169,8 +177,7 @@ L(last_4x_vec_or_less): addq %rax, %rdx jl L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -191,31 +198,27 @@ L(last_2x_vec): jl L(zero) addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x0): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x1): bsrl %eax, %eax addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x2): bsrl %eax, %eax addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3): @@ -232,8 +235,7 @@ L(last_vec_x1_check): jl L(zero) addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3_check): @@ -243,12 +245,14 @@ L(last_vec_x3_check): jl L(zero) addl $(VEC_SIZE * 3), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + VZEROUPPER_RETURN + + .p2align 4 L(null): xorl %eax, %eax ret @@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_or_less): @@ -315,8 +318,7 @@ L(last_vec_or_less): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_2x_aligned): @@ -353,7 +355,6 @@ L(last_vec_2x_aligned): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret -END (__memrchr_avx2) + VZEROUPPER_RETURN +END (MEMRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S new file mode 100644 index 0000000000..8ac3e479bb --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S @@ -0,0 +1,10 @@ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return) + +#define SECTION(p) p##.avx.rtm +#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm +#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + +#include "memset-avx2-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 7ab3d89849..ae0860f36a 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -14,9 +14,15 @@ movq r, %rax; \ vpbroadcastd %xmm0, %ymm0 -# define SECTION(p) p##.avx -# define MEMSET_SYMBOL(p,s) p##_avx2_##s -# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# ifndef SECTION +# define SECTION(p) p##.avx +# endif +# ifndef MEMSET_SYMBOL +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# endif +# ifndef WMEMSET_SYMBOL +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# endif # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 9f14e956d1..7747bc5d8b 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -45,17 +45,14 @@ #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper +# define VZEROUPPER_SHORT_RETURN vzeroupper; ret # else # define VZEROUPPER # endif #endif #ifndef 
VZEROUPPER_SHORT_RETURN -# if VEC_SIZE > 16 -# define VZEROUPPER_SHORT_RETURN vzeroupper -# else -# define VZEROUPPER_SHORT_RETURN rep -# endif +# define VZEROUPPER_SHORT_RETURN rep; ret #endif #ifndef MOVQ @@ -127,8 +124,7 @@ L(entry_from_bzero): /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -151,14 +147,12 @@ ENTRY (__memset_erms) ENTRY (MEMSET_SYMBOL (__memset, erms)) # endif L(stosb): - /* Issue vzeroupper before rep stosb. */ - VZEROUPPER mov %RDX_LP, %RCX_LP movzbl %sil, %eax mov %RDI_LP, %RDX_LP rep stosb mov %RDX_LP, %RAX_LP - ret + VZEROUPPER_RETURN # if VEC_SIZE == 16 END (__memset_erms) # else @@ -185,8 +179,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(stosb_more_2x_vec): cmpq $REP_STOSB_THRESHOLD, %rdx @@ -200,8 +193,11 @@ L(more_2x_vec): VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(loop_start): leaq (VEC_SIZE * 4)(%rdi), %rcx @@ -227,7 +223,6 @@ L(loop): cmpq %rcx, %rdx jne L(loop) VZEROUPPER_SHORT_RETURN - ret L(less_vec): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 @@ -251,40 +246,34 @@ L(less_vec): jb 1f movb %cl, (%rdi) 1: - VZEROUPPER - ret + VZEROUPPER_RETURN # if VEC_SIZE > 32 /* From 32 to 63. No branch when size == 32. */ L(between_32_63): VMOVU %YMM0, -32(%rdi,%rdx) VMOVU %YMM0, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): VMOVU %XMM0, -16(%rdi,%rdx) VMOVU %XMM0, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN # endif /* From 8 to 15. No branch when size == 8. */ L(between_8_15): movq %rcx, -8(%rdi,%rdx) movq %rcx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_4_7): /* From 4 to 7. No branch when size == 4. */ movl %ecx, -4(%rdi,%rdx) movl %ecx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_2_3): /* From 2 to 3. No branch when size == 2. 
*/ movw %cx, -2(%rdi,%rdx) movw %cx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S new file mode 100644 index 0000000000..acc5f6e2fb --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_avx2_rtm +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S new file mode 100644 index 0000000000..60a2ccfe53 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S new file mode 100644 index 0000000000..637fb557c4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCAT +# define STRCAT __strcat_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcat-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S new file mode 100644 index 0000000000..81f20d1d8e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCHR +# define STRCHR __strchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 47bc3c9949..da7d262065 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -38,9 +38,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCHR) movl %edi, %ecx /* Broadcast CHAR to YMM0. 
*/ @@ -93,8 +97,8 @@ L(cros_page_boundary): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -190,8 +194,7 @@ L(first_vec_x0): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -205,8 +208,7 @@ L(first_vec_x1): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -220,8 +222,7 @@ L(first_vec_x2): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -247,8 +248,7 @@ L(first_vec_x3): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c index efe6584076..9bae2099d9 100644 --- a/sysdeps/x86_64/multiarch/strchr.c +++ b/sysdeps/x86_64/multiarch/strchr.c @@ -29,6 +29,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURES_CPU_P (cpu_features, BMI2)) return OPTIMIZE (evex); + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S new file mode 100644 index 0000000000..cdcf818b91 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_avx2_rtm +#define USE_AS_STRCHRNUL 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S new file mode 100644 index 0000000000..aecd30d97f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCMP +# define STRCMP __strcmp_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index 8fb8eedcde..5d1c9d9018 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -55,6 +55,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* Warning! wcscmp/wcsncmp have to use SIGNED comparison for elements. strcmp/strncmp have to use UNSIGNED comparison for elements. @@ -75,7 +79,7 @@ the maximum offset is reached before a difference is found, zero is returned. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCMP) # ifdef USE_AS_STRNCMP /* Check for simple cases (0 or 1) in offset. 
*/ @@ -137,8 +141,8 @@ L(return): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(return_vec_size): @@ -171,8 +175,7 @@ L(return_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_2_vec_size): @@ -205,8 +208,7 @@ L(return_2_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_3_vec_size): @@ -239,8 +241,7 @@ L(return_3_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(next_3_vectors): @@ -366,8 +367,7 @@ L(back_to_loop): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_vec): @@ -410,8 +410,7 @@ L(test_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_2_vec): @@ -454,8 +453,7 @@ L(test_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_3_vec): @@ -496,8 +494,7 @@ L(test_3_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page): @@ -566,8 +563,7 @@ L(loop_cross_page): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page_2_vec): @@ -641,8 +637,7 @@ L(loop_cross_page_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCMP L(string_nbyte_offset_check): @@ -684,8 +679,7 @@ L(cross_page_loop): # ifndef USE_AS_WCSCMP L(different): # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WCSCMP .p2align 4 @@ -695,16 +689,14 @@ L(different): setl %al negl %eax orl $1, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # ifdef USE_AS_STRNCMP .p2align 4 L(zero): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char0): @@ -718,8 +710,7 @@ L(char0): movzbl (%rdi), %eax subl %ecx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif .p2align 4 @@ -744,8 +735,7 @@ L(last_vector): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN /* Comparing on page boundary region requires special treatment: It must done one vector at the time, starting with the wider @@ -866,7 +856,6 @@ L(cross_page_4bytes): testl %eax, %eax jne L(cross_page_loop) subl %ecx, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCMP) #endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index e947cefb08..c99c08aa3f 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ b/sysdeps/x86_64/multiarch/strcmp.c @@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) return OPTIMIZE (evex); + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S new file mode 100644 index 0000000000..c2c581ecf7 --- /dev/null +++ 
b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCPY +# define STRCPY __strcpy_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S new file mode 100644 index 0000000000..75b4b7612c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRLEN +# define STRLEN __strlen_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index 645e04461f..82826e1098 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN /* Check for zero length. */ @@ -111,8 +115,8 @@ L(cros_page_boundary): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -231,8 +235,7 @@ L(last_4x_vec_or_less): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -253,8 +256,7 @@ L(last_2x_vec): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x0_check): @@ -267,8 +269,7 @@ L(first_vec_x0_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1_check): @@ -282,8 +283,7 @@ L(first_vec_x1_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2_check): @@ -297,8 +297,7 @@ L(first_vec_x2_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x3_check): @@ -312,8 +311,7 @@ L(first_vec_x3_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(max): @@ -321,8 +319,7 @@ L(max): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): @@ -338,8 +335,7 @@ L(first_vec_x0): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -350,8 +346,7 @@ L(first_vec_x1): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -362,8 +357,7 @@ L(first_vec_x2): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -389,8 +383,7 @@ L(first_vec_x3): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S new file mode 100644 index 0000000000..0dcea18dbb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_avx2_rtm +#include "strcat-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S 
b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S new file mode 100644 index 0000000000..37d1224bb9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index 4069946f80..880e39659f 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) return OPTIMIZE (evex); + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S new file mode 100644 index 0000000000..79e7083299 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S new file mode 100644 index 0000000000..04f1626a5c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_avx2_rtm +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S new file mode 100644 index 0000000000..5def14ec1c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRRCHR +# define STRRCHR __strrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index 4381e6ab3e..9f22a15e25 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRRCHR) movd %esi, %xmm4 movl %edi, %ecx @@ -166,8 +170,8 @@ L(return_value): # endif bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(match): @@ -198,8 +202,7 @@ L(find_nul): jz L(return_value) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char_and_nul): @@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): jz L(return_null) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_null): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S 
b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S new file mode 100644 index 0000000000..d49dbbf0b4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_avx2_rtm +#define USE_AS_WCSCHR 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S new file mode 100644 index 0000000000..d6ca2b8064 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_avx2_rtm +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S new file mode 100644 index 0000000000..35658d7365 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_avx2_rtm +#define USE_AS_WCSLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S new file mode 100644 index 0000000000..4e88c70cc6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S new file mode 100644 index 0000000000..7437ebee2d --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_avx2_rtm +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c index f0bcfd180d..95f3cfb2cf 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen.c +++ b/sysdeps/x86_64/multiarch/wcsnlen.c @@ -29,6 +29,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURES_CPU_P (cpu_features, BMI2)) return OPTIMIZE (evex); + if (CPU_FEATURES_CPU_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S new file mode 100644 index 0000000000..9bf760833f --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_avx2_rtm +#define USE_AS_WCSRCHR 1 +#include "strrchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S new file mode 100644 index 0000000000..58ed21db01 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_avx2_rtm +#define USE_AS_WMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000000..31104d1215 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe_rtm +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe-rtm.S" diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index 
1738d7f955..223f1a5949 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -95,6 +95,28 @@ lose:							\
 #define R14_LP	r14
 #define R15_LP	r15
 
+/* Zero upper vector registers and return with xtest.  NB: Use VZEROALL
+   to avoid the RTM abort triggered by VZEROUPPER inside an RTM region.  */
+#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
+	xtest;							\
+	jz	1f;						\
+	vzeroall;						\
+	ret;							\
+1:								\
+	vzeroupper;						\
+	ret
+
+/* Zero upper vector registers and return.  */
+#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+	VZEROUPPER;						\
+	ret
+#endif
+
+#ifndef VZEROUPPER_RETURN
+# define VZEROUPPER_RETURN	VZEROUPPER;	ret
+#endif
+
 #else	/* __ASSEMBLER__ */
 
 /* Long and pointer size in bytes.  */
--
cgit v1.2.3
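
The heart of the change is the return-sequence macro added to sysdep.h above: VZEROUPPER forces an abort when executed inside a transactionally executing RTM region, while VZEROALL does not, so the AVX2 string/memory functions now execute XTEST at function exit and pick whichever instruction is safe in the current context. XTEST clears ZF while a transaction is in flight, which is why the macro falls through to VZEROALL when transactional and takes the `jz 1f` branch to VZEROUPPER otherwise. A minimal C sketch of the same decision using compiler intrinsics follows; it only illustrates the logic (glibc emits the assembly macro directly), the helper name is made up, and it assumes a compiler accepting -mavx -mrtm.

    #include <immintrin.h>

    /* Hypothetical helper mirroring ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST:
       clear the upper YMM state before returning without aborting a
       transaction the caller may have started.  Build with -mavx -mrtm.  */
    static inline void
    clear_upper_vec_registers (void)
    {
      if (_xtest ())
        /* Executing transactionally: VZEROUPPER would abort, VZEROALL won't.  */
        _mm256_zeroall ();
      else
        /* Outside a transaction, the usual VZEROUPPER is enough to avoid
           the AVX-to-SSE transition penalty.  */
        _mm256_zeroupper ();
    }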
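
On the selection side, every updated ifunc header (ifunc-avx2.h, ifunc-memcmp.h, ifunc-memmove.h, ifunc-memset.h, ifunc-wmemset.h) and C selector (strchr.c, strcmp.c, strncmp.c, wcsnlen.c) applies the same ordering: prefer the 256-bit EVEX implementation when AVX512VL/AVX512BW are usable, otherwise fall back to the new *_avx2_rtm variant when the CPU reports RTM, and only then to the plain AVX2 or SSE2 code; the memmove/memset family additionally branches on ERMS to pick the _erms_rtm flavor. Below is a stand-alone sketch of that ordering. glibc actually consults its cpu_features structure via CPU_FEATURES_CPU_P/CPU_FEATURES_ARCH_P, so the direct CPUID probing, stub implementations, and names here are illustrative assumptions only, and a real usability check would also verify OS-enabled AVX state via XGETBV.

    #include <cpuid.h>
    #include <stddef.h>

    /* CPUID.(EAX=7,ECX=0):EBX feature bits, per the Intel SDM.  */
    #define BIT7_AVX2 (1u << 5)
    #define BIT7_RTM  (1u << 11)

    typedef void *(*impl_fn) (void);

    /* Stubs standing in for __memchr_avx2_rtm and friends.  */
    static void *impl_avx2_rtm (void) { return NULL; }
    static void *impl_avx2 (void)     { return NULL; }
    static void *impl_sse2 (void)     { return NULL; }

    static impl_fn
    select_impl (void)
    {
      unsigned int eax, ebx, ecx, edx;

      if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
        return impl_sse2;

      /* The real selectors try the EVEX (AVX512VL/AVX512BW) variant first;
         omitted here for brevity.  */
      if ((ebx & BIT7_AVX2) && (ebx & BIT7_RTM))
        return impl_avx2_rtm;   /* RTM present: use the XTEST/VZEROALL exit.  */
      if (ebx & BIT7_AVX2)
        return impl_avx2;       /* No RTM: the plain VZEROUPPER exit is fine.  */
      return impl_sse2;
    }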
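
Why the RTM variants matter in practice: code that uses RTM (for example pthread lock elision) calls into the string/memory functions from inside a transaction, and a VZEROUPPER at their exit aborts that transaction every time, forcing the slow fallback path. The sketch below is one hypothetical way to observe commit rates from user code with the RTM intrinsics; it is not part of this patch (glibc's dedicated tst-*-rtm tests were added separately), it needs -mrtm and hardware with RTM enabled, and ordinary transient aborts mean the printed count is only a rough signal.

    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    /* Run COUNT transactions that call memcpy inside the RTM region and
       report how many committed.  Purely illustrative; build with -mrtm.  */
    int
    main (void)
    {
      char src[256] = "payload", dst[256];
      enum { COUNT = 10000 };
      int committed = 0;

      for (int i = 0; i < COUNT; i++)
        {
          unsigned int status = _xbegin ();
          if (status == _XBEGIN_STARTED)
            {
              memcpy (dst, src, sizeof src);  /* executes transactionally */
              _xend ();
              committed++;
            }
          /* On abort, control returns from _xbegin with a status code and
             the transaction's effects are rolled back.  */
        }

      printf ("%d of %d transactions committed\n", committed, COUNT);
      return 0;
    }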