From ef9c4cb6c7abb6340b52e19de31d2a56c8de5844 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 5 Jun 2017 11:09:48 -0700 Subject: x86-64: Optimize wmemset with SSE2/AVX2/AVX512 The difference between memset and wmemset is byte vs int. Add stubs to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size: SSE2 wmemset: shl $0x2,%rdx movd %esi,%xmm0 mov %rdi,%rax pshufd $0x0,%xmm0,%xmm0 jmp entry_from_wmemset SSE2 memset: movd %esi,%xmm0 mov %rdi,%rax punpcklbw %xmm0,%xmm0 punpcklwd %xmm0,%xmm0 pshufd $0x0,%xmm0,%xmm0 entry_from_wmemset: Since the ERMS versions of wmemset require "rep stosl" instead of "rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset are added. The SSE2 wmemset is about 3X faster and the AVX2 wmemset is about 6X faster on Haswell. * include/wchar.h (__wmemset_chk): New. * sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New. (WMEMSET_CHK_SYMBOL): Likewise. (WMEMSET_SYMBOL): Likewise. (__wmemset): Add hidden definition. (wmemset): Add weak hidden definition. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add wmemset_chk-nonshared. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned, __wmemset_avx2_unaligned, __wmemset_avx512_unaligned, __wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned and __wmemset_chk_avx512_unaligned. * sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ... (MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This. (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New. (WMEMSET_SYMBOL): Likewise. * sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ... (MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This. (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New. (WMEMSET_SYMBOL): Likewise. * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated. (WMEMSET_CHK_SYMBOL): New. 
(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise. (WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise. * sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New. (libc_hidden_builtin_def): Also define __GI_wmemset and __GI___wmemset. (weak_alias): New. * sysdeps/x86_64/multiarch/wmemset.c: New file. * sysdeps/x86_64/multiarch/wmemset.h: Likewise. * sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S: Likewise. * sysdeps/x86_64/multiarch/wmemset_chk.c: Likewise. * sysdeps/x86_64/wmemset.S: Likewise. * sysdeps/x86_64/wmemset_chk.S: Likewise. --- sysdeps/x86_64/memset.S | 18 +++++++++- sysdeps/x86_64/multiarch/Makefile | 4 +++ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 ++++++++++++ .../x86_64/multiarch/memset-avx2-unaligned-erms.S | 8 ++++- .../multiarch/memset-avx512-unaligned-erms.S | 9 ++++- .../x86_64/multiarch/memset-vec-unaligned-erms.S | 24 +++++++++++-- sysdeps/x86_64/multiarch/memset.S | 13 +++++-- sysdeps/x86_64/multiarch/wmemset.c | 33 +++++++++++++++++ sysdeps/x86_64/multiarch/wmemset.h | 42 ++++++++++++++++++++++ sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S | 21 +++++++++++ sysdeps/x86_64/multiarch/wmemset_chk.c | 31 ++++++++++++++++ sysdeps/x86_64/wmemset.S | 1 + sysdeps/x86_64/wmemset_chk.S | 33 +++++++++++++++++ 13 files changed, 250 insertions(+), 9 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/wmemset.c create mode 100644 sysdeps/x86_64/multiarch/wmemset.h create mode 100644 sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S create mode 100644 sysdeps/x86_64/multiarch/wmemset_chk.c create mode 100644 sysdeps/x86_64/wmemset.S create mode 100644 sysdeps/x86_64/wmemset_chk.S (limited to 'sysdeps/x86_64') diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S index 69ed509c28..41278787fe 100644 --- a/sysdeps/x86_64/memset.S +++ b/sysdeps/x86_64/memset.S @@ -26,13 +26,18 @@ #define VMOVU movdqu #define VMOVA movdqa -#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ movd d, %xmm0; 
\ movq r, %rax; \ punpcklbw %xmm0, %xmm0; \ punpcklwd %xmm0, %xmm0; \ pshufd $0, %xmm0, %xmm0 +#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + movq r, %rax; \ + pshufd $0, %xmm0, %xmm0 + #define SECTION(p) p #ifndef MEMSET_SYMBOL @@ -40,10 +45,21 @@ # define MEMSET_SYMBOL(p,s) memset #endif +#ifndef WMEMSET_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) p +# define WMEMSET_SYMBOL(p,s) __wmemset +#endif + #include "multiarch/memset-vec-unaligned-erms.S" libc_hidden_builtin_def (memset) +#if IS_IN (libc) +libc_hidden_def (__wmemset) +weak_alias (__wmemset, wmemset) +libc_hidden_weak (wmemset) +#endif + #if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH strong_alias (__memset_chk, __memset_zero_constant_len_parameter) .section .gnu.warning.__memset_zero_constant_len_parameter diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 3736f54ce4..65a545ba01 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -32,3 +32,7 @@ endif ifeq ($(subdir),wcsmbs) sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c endif + +ifeq ($(subdir),debug) +sysdep_routines += wmemset_chk-nonshared +endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 06d9a9d7f7..a91d2f9efb 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) + /* Support sysdeps/x86_64/multiarch/wmemset.S. 
*/ + IFUNC_IMPL (i, name, wmemset, + IFUNC_IMPL_ADD (array, i, wmemset, 1, + __wmemset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_avx512_unaligned)) + #ifdef SHARED /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */ IFUNC_IMPL (i, name, __memcpy_chk, @@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemset_chk.S. */ + IFUNC_IMPL (i, name, __wmemset_chk, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1, + __wmemset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_chk_avx512_unaligned)) #endif return i; diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 79975e0825..7ab3d89849 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -4,13 +4,19 @@ # define VMOVU vmovdqu # define VMOVA vmovdqa -# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ vmovd d, %xmm0; \ movq r, %rax; \ vpbroadcastb %xmm0, %ymm0 +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + # define SECTION(p) p##.avx # define MEMSET_SYMBOL(p,s) p##_avx2_##s +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index a5ec349198..0783979ca5 100644 
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -4,14 +4,21 @@ # define VMOVU vmovdqu64 # define VMOVA vmovdqa64 -# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ vmovd d, %xmm0; \ movq r, %rax; \ vpbroadcastb %xmm0, %xmm0; \ vpbroadcastq %xmm0, %zmm0 +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + # define SECTION(p) p##.avx512 # define MEMSET_SYMBOL(p,s) p##_avx512_##s +# define WMEMSET_SYMBOL(p,s) p##_avx512_##s # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 704eed9a3f..2eb9e3744e 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -30,6 +30,10 @@ # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) #endif +#ifndef WMEMSET_CHK_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper @@ -79,6 +83,21 @@ END (__bzero) weak_alias (__bzero, bzero) #endif +#if IS_IN (libc) +# if defined SHARED +ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +# endif + +ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shlq $2, %rdx + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) +END (WMEMSET_SYMBOL (__wmemset, unaligned)) +#endif + #if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) cmpq %rdx, %rcx @@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) #endif ENTRY (MEMSET_SYMBOL (__memset, unaligned)) -L(memset_entry): - VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + 
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) L(entry_from_bzero): cmpq $VEC_SIZE, %rdx jb L(less_vec) @@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) # endif ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) - VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) cmpq $VEC_SIZE, %rdx jb L(less_vec) cmpq $(VEC_SIZE * 2), %rdx diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S index 9d33118cf8..11f27378b0 100644 --- a/sysdeps/x86_64/multiarch/memset.S +++ b/sysdeps/x86_64/multiarch/memset.S @@ -58,16 +58,23 @@ END(memset) #if IS_IN (libc) # define MEMSET_SYMBOL(p,s) p##_sse2_##s +# define WMEMSET_SYMBOL(p,s) p##_sse2_##s # ifdef SHARED -# undef libc_hidden_builtin_def +# undef libc_hidden_builtin_def /* It doesn't make sense to send libc-internal memset calls through a PLT. The speedup we get from using SSE2 instructions is likely eaten away by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memset; __GI_memset = __memset_sse2_unaligned +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \ + .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \ + .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned # endif +# undef weak_alias +# define weak_alias(original, alias) \ + .weak bzero; bzero = __bzero + # undef strong_alias # define strong_alias(original, alias) #endif diff --git a/sysdeps/x86_64/multiarch/wmemset.c b/sysdeps/x86_64/multiarch/wmemset.c new file mode 100644 index 0000000000..61626a9e74 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset.c @@ -0,0 +1,33 @@ +/* Multiple versions of wmemset. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wmemset __redirect_wmemset +# define __wmemset __redirect___wmemset +# include <wchar.h> +# undef wmemset +# undef __wmemset + +# define SYMBOL_NAME wmemset +# include "wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ()); +weak_alias (__wmemset, wmemset) +#endif diff --git a/sysdeps/x86_64/multiarch/wmemset.h b/sysdeps/x86_64/multiarch/wmemset.h new file mode 100644 index 0000000000..d761985a47 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset.h @@ -0,0 +1,42 @@ +/* Common definition for wmemset/wmemset_chk ifunc selections. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + return OPTIMIZE (avx512_unaligned); + else + return OPTIMIZE (avx2_unaligned); + } + + return OPTIMIZE (sse2_unaligned); +} diff --git a/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S new file mode 100644 index 0000000000..0a537fe272 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of wmemset_chk for x86-64. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) && !defined SHARED +# include "../wmemset_chk.S" +#endif diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.c b/sysdeps/x86_64/multiarch/wmemset_chk.c new file mode 100644 index 0000000000..2c039a5141 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of wmemset_chk. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __wmemset_chk __redirect_wmemset_chk +# include <wchar.h> +# undef __wmemset_chk + +# define SYMBOL_NAME wmemset_chk +# include "wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk, + IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S new file mode 100644 index 0000000000..f96d567fd8 --- /dev/null +++ b/sysdeps/x86_64/wmemset.S @@ -0,0 +1 @@ +/* Implemented in memset.S. */ diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S new file mode 100644 index 0000000000..64c277413f --- /dev/null +++ b/sysdeps/x86_64/wmemset_chk.S @@ -0,0 +1,33 @@ +/* Checking wmemset for x86-64. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef SHARED + /* For libc.so this is defined in memset.S. + For libc.a, this is a separate source to avoid + wmemset bringing in __chk_fail and all routines + it calls. */ + .text +ENTRY (__wmemset_chk) + cmpq %rdx, %rcx + jb __chk_fail + jmp wmemset +END (__wmemset_chk) +#endif -- cgit v1.2.3