diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2016-03-25 08:20:17 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2016-03-31 09:00:41 -0700 |
commit | 7df7c6a195d6bc6ffdd90db0786d5de9c67d037a (patch) | |
tree | e406db6d384ec00077f4d8feb85364476cc3e1d5 | |
parent | d1f2de07cb44abfb9e78f825e3edf2490cf1057c (diff) | |
download | glibc-7df7c6a195d6bc6ffdd90db0786d5de9c67d037a.tar glibc-7df7c6a195d6bc6ffdd90db0786d5de9c67d037a.tar.gz glibc-7df7c6a195d6bc6ffdd90db0786d5de9c67d037a.tar.bz2 glibc-7df7c6a195d6bc6ffdd90db0786d5de9c67d037a.zip |
Add x86-64 memset with unaligned store and rep stosb
Implement x86-64 memset with unaligned store and rep stosb. Support
16-byte, 32-byte and 64-byte vector register sizes. A single file
provides 2 implementations of memset, one with rep stosb and the other
without rep stosb. They share the same code when the size is between 2
times the vector register size and REP_STOSB_THRESHOLD, which defaults
to 2KB.
Key features:
1. Use overlapping stores to avoid branches.
2. For size <= 4 times the vector register size, fully unroll the loop.
3. For size > 4 times the vector register size, store 4 times the vector
register size at a time.
[BZ #19881]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
memset-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
__memset_sse2_unaligned_erms, __memset_erms,
__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
Likewise.
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 5 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 33 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S | 14 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S | 17 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 16 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 251 |
6 files changed, 335 insertions, 1 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index ef4dbc0c6f..8878efbc8f 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ memset-avx512-no-vzeroupper \ memmove-sse2-unaligned-erms \ memmove-avx-unaligned-erms \ - memmove-avx512-unaligned-erms + memmove-avx512-unaligned-erms \ + memset-sse2-unaligned-erms \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 9204da450a..1e880f6edc 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, __memset_chk, IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX2_Usable), __memset_chk_avx2) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned_erms) #ifdef HAVE_AVX512_ASM_SUPPORT IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), __memset_chk_avx512_no_vzeroupper) #endif ) @@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support 
sysdeps/x86_64/multiarch/memset.S. */ IFUNC_IMPL (i, name, memset, IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2) + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms) IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX2_Usable), __memset_avx2) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned_erms) #ifdef HAVE_AVX512_ASM_SUPPORT IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), __memset_avx512_no_vzeroupper) #endif ) diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S new file mode 100644 index 0000000000..e0dc56512e --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -0,0 +1,14 @@ +#define VEC_SIZE 32 +#define VEC(i) ymm##i +#define VMOVU vmovdqu +#define VMOVA vmovdqa + +#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %ymm0 + +#define SECTION(p) p##.avx +#define MEMSET_SYMBOL(p,s) p##_avx2_##s + +#include "memset-vec-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S new file mode 100644 index 0000000000..72f4095831 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -0,0 +1,17 @@ +#ifdef HAVE_AVX512_ASM_SUPPORT +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, 
%rax; \ + vpbroadcastb %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define SECTION(p) p##.avx512 +# define MEMSET_SYMBOL(p,s) p##_avx512_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S new file mode 100644 index 0000000000..437a858dab --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S @@ -0,0 +1,16 @@ +#define VEC_SIZE 16 +#define VEC(i) xmm##i +#define VMOVU movdqu +#define VMOVA movdqa + +#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + movq r, %rax; \ + punpcklbw %xmm0, %xmm0; \ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + +#define SECTION(p) p +#define MEMSET_SYMBOL(p,s) p##_sse2_##s + +#include "memset-vec-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S new file mode 100644 index 0000000000..9383517536 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -0,0 +1,251 @@ +/* memset/bzero with unaligned store and rep stosb + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* memset is implemented as: + 1. Use overlapping store to avoid branch. + 2. 
Force 32-bit displacement for branches to avoid long nop between + instructions. + 3. If size is less than VEC, use integer register stores. + 4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. + 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. */ + +#include <sysdep.h> + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +#ifndef VZEROUPPER_SHORT_RETURN +# if VEC_SIZE > 16 +# define VZEROUPPER_SHORT_RETURN vzeroupper +# else +# define VZEROUPPER_SHORT_RETURN rep +# endif +#endif + +#ifndef MOVQ +# if VEC_SIZE > 16 +# define MOVQ vmovq +# else +# define MOVQ movq +# endif +#endif + +/* Threshold to use Enhanced REP STOSB. Since there is overhead to set + up REP STOSB operation, REP STOSB isn't faster on short data. The + memset micro benchmark in glibc shows that 2KB is the approximate + value above which REP STOSB becomes faster on processors with + Enhanced REP STOSB. Since the stored value is fixed, larger register + size has minimal impact on threshold. */ +#ifndef REP_STOSB_THRESHOLD +# define REP_STOSB_THRESHOLD 2048 +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + +#if !defined USE_MULTIARCH && IS_IN (libc) + .section SECTION(.text),"ax",@progbits +ENTRY (__bzero) + movq %rdi, %rax /* Set return value. */ + movq %rsi, %rdx /* Set n. 
*/ + pxor %xmm0, %xmm0 + jmp L(entry_from_bzero) +END (__bzero) +weak_alias (__bzero, bzero) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned)) +#endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned)) +L(memset_entry): + VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret +END (MEMSET_SYMBOL (__memset, unaligned)) + +#if VEC_SIZE == 16 +/* Only used to measure performance of REP STOSB. */ +ENTRY (__memset_erms) +#else +/* Provide a symbol to debugger. */ +ENTRY (MEMSET_SYMBOL (__memset, erms)) +#endif +L(stosb): + movq %rdx, %rcx + movzbl %sil, %eax + movq %rdi, %rdx + rep stosb + movq %rdx, %rax + ret +#if VEC_SIZE == 16 +END (__memset_erms) +#else +END (MEMSET_SYMBOL (__memset, erms)) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms)) +#endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret + + .p2align 4 +L(stosb_more_2x_vec): + cmpq $REP_STOSB_THRESHOLD, %rdx + /* Force 32-bit displacement to avoid long nop between + instructions. 
*/ + ja.d32 L(stosb) + .p2align 4 +L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_start) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +L(return): + VZEROUPPER + ret + + .p2align 4 +L(loop_start): + leaq (VEC_SIZE * 4)(%rdi), %rcx + VMOVU %VEC(0), (%rdi) + andq $-(VEC_SIZE * 4), %rcx + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) + addq %rdi, %rdx + andq $-(VEC_SIZE * 4), %rdx + cmpq %rdx, %rcx +# if VEC_SIZE == 32 || VEC_SIZE == 64 + /* Force 32-bit displacement to avoid long nop between + instructions. */ + je.d32 L(return) +# else + je L(return) +# endif + .p2align 4 +L(loop): + VMOVA %VEC(0), (%rcx) + VMOVA %VEC(0), VEC_SIZE(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) + addq $(VEC_SIZE * 4), %rcx + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN + ret +L(less_vec): + /* Less than 1 VEC. */ +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +# endif +# if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +# endif +# if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +# endif + MOVQ %xmm0, %rcx + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movb %cl, (%rdi) +1: + VZEROUPPER + ret +# if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ +L(between_32_63): + vmovdqu %ymm0, -32(%rdi,%rdx) + vmovdqu %ymm0, (%rdi) + VZEROUPPER + ret +# endif +# if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu %xmm0, -16(%rdi,%rdx) + vmovdqu %xmm0, (%rdi) + VZEROUPPER + ret +# endif + /* From 8 to 15. No branch when size == 8. 
*/ +L(between_8_15): + movq %rcx, -8(%rdi,%rdx) + movq %rcx, (%rdi) + VZEROUPPER + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl %ecx, -4(%rdi,%rdx) + movl %ecx, (%rdi) + VZEROUPPER + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movw %cx, -2(%rdi,%rdx) + movw %cx, (%rdi) + VZEROUPPER + ret +END (MEMSET_SYMBOL (__memset, unaligned_erms)) |