-rw-r--r--  sysdeps/x86_64/memset.S                                  |  10
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S    |  10
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S  |  11
-rw-r--r--  sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S    |  11
-rw-r--r--  sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S     | 285
5 files changed, 232 insertions, 95 deletions
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 7d4a327eba..0137eba4cd 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -18,13 +18,15 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
 
+#define USE_WITH_SSE2	1
 #define VEC_SIZE	16
+#define MOV_SIZE	3
+#define RET_SIZE	1
+
 #define VEC(i)		xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings for
-   alignment.  */
-#define VMOVU		movdqu
-#define VMOVA		movdqa
+#define VMOVU		movups
+#define VMOVA		movaps
 
 #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index ae0860f36a..1af668af0a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,8 +1,14 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX2	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	4
+# define RET_SIZE	4
+
 # define VEC(i)		ymm##i
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
+
+# define VMOVU		vmovdqu
+# define VMOVA		vmovdqa
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 8ad842fc2f..f14d6f8493 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX512	1
+
 # define VEC_SIZE	64
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		zmm16
 # define VEC(i)		VEC##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
 # define VZEROUPPER
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 640f092903..64b09e77cc 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,11 +1,18 @@
 #if IS_IN (libc)
+# define USE_WITH_EVEX	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
 # define XMM0		xmm16
 # define YMM0		ymm16
 # define VEC0		ymm16
 # define VEC(i)		VEC##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
 # define VZEROUPPER
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index ff196844a0..e723413a66 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,8 +63,27 @@
 # endif
 #endif
 
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET	(0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG	rcx
+# define LOOP_REG	rdi
+#else
+# define END_REG	rdi
+# define LOOP_REG	rdx
+#endif
+
 #define PAGE_SIZE 4096
 
+/* Macro to calculate size of small memset block for aligning
+   purposes.  */
+#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
+
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -74,6 +93,7 @@
 ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+	xorl	%esi, %esi
 	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	jb	L(less_vec)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), (%rdi)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_RETURN
-
-	.p2align 4
-L(stosb_more_2x_vec):
-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
-	ja	L(stosb)
-#else
-	.p2align 4
 #endif
-L(more_2x_vec):
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-L(return):
-#if VEC_SIZE > 16
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 10
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
 #else
-	ret
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
 #endif
+	VZEROUPPER_RETURN
 
-L(loop_start):
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	cmpq	$(VEC_SIZE * 8), %rdx
-	jbe	L(loop_end)
-	andq	$-(VEC_SIZE * 2), %rdi
-	subq	$-(VEC_SIZE * 4), %rdi
-	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
-	.p2align 4
-L(loop):
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(0), VEC_SIZE(%rdi)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	subq	$-(VEC_SIZE * 4), %rdi
-	cmpq	%rcx, %rdi
-	jb	L(loop)
-L(loop_end):
-	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
-	       rdx as length is also unchanged.  */
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
-	VZEROUPPER_SHORT_RETURN
-
-	.p2align 4
+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.
+	 */
+#ifdef USE_LESS_VEC_MASK_STORE
+	.p2align 4,, 10
 L(less_vec):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
-# ifdef USE_LESS_VEC_MASK_STORE
 	/* Clear high bits from edi. Only keeping bits relevant to page
 	   cross check. Note that we are using rax which is set in
-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
-	 */
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
 	andl	$(PAGE_SIZE - 1), %edi
-	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
-	   performance degradation when it has to fault supress.  */
+	/* Check if VEC_SIZE store cross page. Mask stores suffer
+	   serious performance degradation when it has to fault supress.
+	 */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
 	ja	L(cross_page)
 # if VEC_SIZE > 32
 	movq	$-1, %rcx
@@ -247,58 +235,185 @@ L(less_vec):
 	bzhil	%edx, %ecx, %ecx
 	kmovd	%ecx, %k1
 # endif
-	vmovdqu8 %VEC(0), (%rax) {%k1}
+	vmovdqu8 %VEC(0), (%rax){%k1}
 	VZEROUPPER_RETURN
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* Include L(stosb_local) here if including L(less_vec) between
+	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+	   L(stosb_more_2x_vec) target.  */
+	.p2align 4,, 10
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+# endif
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
 	.p2align 4
-L(cross_page):
+L(stosb_more_2x_vec):
+	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+	ja	L(stosb_local)
+#endif
+	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+	   and (4x, 8x] jump to target.  */
+L(more_2x_vec):
+
+	/* Two different methods of setting up pointers / compare. The
+	   two methods are based on the fact that EVEX/AVX512 mov
+	   instructions take more bytes then AVX2/SSE2 mov instructions. As
+	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
+	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
+	   this saves code size and keeps a few targets in one fetch block.
+	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+	   LOOP_4X_OFFSET) with LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), VEC_SIZE(%rax)
+
+
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+	addq	%rdx, %END_REG
+#endif
+
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
+	   extra offset to addresses in loop. Used for AVX512 to save space
+	   as no way to get (VEC_SIZE * 4) in imm8.  */
+# if LOOP_4X_OFFSET == 0
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
 # endif
-# if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
-# endif
-# if VEC_SIZE > 16
-	cmpb	$16, %dl
+	/* Avoid imm32 compare here to save code size.  */
+	cmpq	%rdi, %rcx
+#else
+	addq	$-(VEC_SIZE * 4), %END_REG
+	cmpq	$(VEC_SIZE * 8), %rdx
+#endif
+	jbe	L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* Set LOOP_REG (rdx).  */
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+	/* Align dst for loop.  */
+	andq	$(VEC_SIZE * -2), %LOOP_REG
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+	cmpq	%END_REG, %LOOP_REG
+	jb	L(loop)
+	.p2align 4,, MOV_SIZE
+L(last_4x_vec):
+	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+	ret
+#endif
+
+	.p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+	   range for 2-byte jump encoding.  */
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+# endif
+	/* Define L(less_vec) only if not otherwise defined.  */
+	.p2align 4
+L(less_vec):
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+	cmpl	$32, %edx
+	jae	L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+	cmpl	$16, %edx
 	jae	L(between_16_31)
-# endif
-	MOVQ	%XMM0, %rcx
-	cmpb	$8, %dl
+#endif
+	MOVQ	%XMM0, %rdi
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+	cmpl	$4, %edx
 	jae	L(between_4_7)
-	cmpb	$1, %dl
+	cmpl	$1, %edx
 	ja	L(between_2_3)
-	jb	1f
-	movb	%cl, (%rax)
-1:
+	jb	L(return)
+	movb	%sil, (%rax)
 	VZEROUPPER_RETURN
-# if VEC_SIZE > 32
+
+	/* Align small targets only if not doing so would cross a fetch
+	   line.  */
+#if VEC_SIZE > 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, -32(%rax,%rdx)
 	VMOVU	%YMM0, (%rax)
+	VMOVU	%YMM0, -32(%rax, %rdx)
 	VZEROUPPER_RETURN
-# endif
-# if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
+#endif
+
+#if VEC_SIZE >= 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 L(between_16_31):
-	VMOVU	%XMM0, -16(%rax,%rdx)
+	/* From 16 to 31.  No branch when size == 16.  */
 	VMOVU	%XMM0, (%rax)
+	VMOVU	%XMM0, -16(%rax, %rdx)
 	VZEROUPPER_RETURN
-# endif
-	/* From 8 to 15.  No branch when size == 8.  */
+#endif
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_8_15):
-	movq	%rcx, -8(%rax,%rdx)
-	movq	%rcx, (%rax)
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	%rdi, (%rax)
+	movq	%rdi, -8(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%ecx, -4(%rax,%rdx)
-	movl	%ecx, (%rax)
+	movl	%edi, (%rax)
+	movl	%edi, -4(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%cx, -2(%rax,%rdx)
-	movw	%cx, (%rax)
+	movw	%di, (%rax)
+	movb	%dil, -1(%rax, %rdx)
 	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
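
The hedged C sketch below is not part of the commit; it only restates the size dispatch that the rewritten memset-vec-unaligned-erms.S implements, for readers following the labels above. VEC_SIZE, REP_STOSB_THRESHOLD, store_vec() and memset_sketch() are illustrative stand-ins invented here (the real code picks the vector width per ISA, reads __x86_rep_stosb_threshold, and uses VMOVU/VMOVA, masked stores and rep stosb), and details such as the 2 * VEC_SIZE loop alignment and the EVEX/AVX512 LOOP_4X_OFFSET addressing are left out.

#include <stddef.h>

/* Stand-ins for this sketch only.  */
enum { VEC_SIZE = 32 };                          /* per-ISA vector width */
static const size_t REP_STOSB_THRESHOLD = 2048;  /* __x86_rep_stosb_threshold */

/* Models one unaligned vector store (VMOVU %VEC(0), (p)).  */
static void store_vec (unsigned char *p, unsigned char c)
{
  for (int i = 0; i < VEC_SIZE; i++)
    p[i] = c;
}

static void *memset_sketch (void *dst, int c0, size_t n)
{
  unsigned char *p = dst;
  unsigned char c = (unsigned char) c0;

  if (n < VEC_SIZE)
    {
      /* L(less_vec): masked store (EVEX/AVX512) or the 2-3/4-7/8-15/16-31
         store ladder.  */
      for (size_t i = 0; i < n; i++)
        p[i] = c;
      return dst;
    }
  if (n <= 2 * VEC_SIZE)
    {
      /* Entry fast path: first VEC plus an overlapping last VEC.  */
      store_vec (p, c);
      store_vec (p + n - VEC_SIZE, c);
      return dst;
    }
  if (n > REP_STOSB_THRESHOLD)
    {
      /* L(stosb_local): rep stosb once past the ERMS threshold.  */
      for (size_t i = 0; i < n; i++)
        p[i] = c;
      return dst;
    }

  /* L(more_2x_vec): the first 2x VEC stores always happen.  */
  store_vec (p, c);
  store_vec (p + VEC_SIZE, c);
  if (n <= 4 * VEC_SIZE)
    {
      /* L(last_2x_vec): overlapping 2x VEC tail finishes (2x, 4x].  */
      store_vec (p + n - 2 * VEC_SIZE, c);
      store_vec (p + n - VEC_SIZE, c);
      return dst;
    }

  /* Next 2x VEC stored regardless for anything above 4x VEC.  */
  store_vec (p + 2 * VEC_SIZE, c);
  store_vec (p + 3 * VEC_SIZE, c);
  if (n > 8 * VEC_SIZE)
    {
      /* L(loop): 4x VEC per iteration; the real code first aligns
         LOOP_REG down to 2 * VEC_SIZE and stops at END_REG.  */
      unsigned char *cur = p + 4 * VEC_SIZE;
      unsigned char *end = p + n - 4 * VEC_SIZE;
      while (cur < end)
        {
          store_vec (cur, c);
          store_vec (cur + VEC_SIZE, c);
          store_vec (cur + 2 * VEC_SIZE, c);
          store_vec (cur + 3 * VEC_SIZE, c);
          cur += 4 * VEC_SIZE;
        }
    }
  /* L(last_4x_vec): unconditional overlapping 4x VEC tail covers the
     remainder for both the (4x, 8x] case and the loop exit.  */
  store_vec (p + n - 4 * VEC_SIZE, c);
  store_vec (p + n - 3 * VEC_SIZE, c);
  store_vec (p + n - 2 * VEC_SIZE, c);
  store_vec (p + n - VEC_SIZE, c);
  return dst;
}

/* Example use: char buf[100]; memset_sketch (buf, 0xAB, sizeof buf); */

The recurring pattern is overlapping head/tail stores: every size class is finished by unaligned stores anchored at the end of the buffer, so no path needs a byte-granular cleanup loop, and the new .p2align 4,, SMALL_MEMSET_ALIGN(mov_sz, ret_sz) directives cap the padding so a small target is only realigned when its two stores plus the return would otherwise straddle an instruction-fetch boundary.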