Diffstat (limited to 'sysdeps/x86_64')
6 files changed, 43 insertions, 99 deletions
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..bc8605faf3 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -1,10 +1,6 @@
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+#include "x86-avx-rtm-vecs.h"
 
-#define VZEROUPPER_RETURN jmp L(return)
-
-#define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
 
-#include "memset-avx2-unaligned-erms.S"
+# include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a9054a9122..47cf5072a4 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,14 +4,9 @@
 
 # define USE_WITH_AVX2 1
 
-# define VEC_SIZE 32
-# define MOV_SIZE 4
-# define RET_SIZE 4
-
-# define VEC(i) ymm##i
-
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
 	vmovd d, %xmm0; \
@@ -26,9 +21,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
 # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
-# ifndef SECTION
-#  define SECTION(p) p##.avx
-# endif
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s) p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 47623b8ee8..84145b6c27 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_AVX512 1
 
-# define VEC_SIZE 64
-# define MOV_SIZE 6
-# define RET_SIZE 1
-
-# define XMM0 xmm16
-# define YMM0 ymm16
-# define VEC0 zmm16
-# define VEC(i) VEC##i
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex512-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-	vpbroadcastb d, %VEC0; \
+	vpbroadcastb d, %VMM(0); \
 	movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-	vpbroadcastd d, %VEC0; \
+	vpbroadcastd d, %VMM(0); \
 	movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p) p##.evex512
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s) p##_avx512_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ac4b2d2d50..1f03b26bf8 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -4,26 +4,14 @@
 
 # define USE_WITH_EVEX 1
 
-# define VEC_SIZE 32
-# define MOV_SIZE 6
-# define RET_SIZE 1
-
-# define XMM0 xmm16
-# define YMM0 ymm16
-# define VEC0 ymm16
-# define VEC(i) VEC##i
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-# define VZEROUPPER
+# include "x86-evex256-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-	vpbroadcastb d, %VEC0; \
+	vpbroadcastb d, %VMM(0); \
 	movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-	vpbroadcastd d, %VEC0; \
+	vpbroadcastd d, %VMM(0); \
 	movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -32,8 +20,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p) p##.evex
-
 #ifndef MEMSET_SYMBOL
 # define MEMSET_SYMBOL(p,s) p##_evex_##s
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 44f9b8888b..34b245d8ca 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -26,13 +26,7 @@
 # include <sysdep.h>
 # define USE_WITH_SSE2 1
 
-# define VEC_SIZE 16
-# define MOV_SIZE 3
-# define RET_SIZE 1
-
-# define VEC(i) xmm##i
-# define VMOVU movups
-# define VMOVA movaps
+# include "x86-sse2-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
 	movd d, %xmm0; \
@@ -52,8 +46,6 @@
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p) p
-
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s) p##_sse2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 905d0fa464..03de0ab907 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,14 +34,6 @@
 # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0 xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0 ymm0
-#endif
-
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -150,8 +142,8 @@ L(entry_from_wmemset):
 	cmpq $(VEC_SIZE * 2), %rdx
 	ja L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU %VEC(0), (%rdi)
+	VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU %VMM(0), (%rdi)
 	VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
@@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	cmp $(VEC_SIZE * 2), %RDX_LP
 	ja L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU %VEC(0), (%rdi)
-	VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU %VMM(0), (%rdi)
+	VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
-	VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VMOVU %VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
-	VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
-	VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
+	VMOVU %VMM(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi)
 #endif
 	VZEROUPPER_RETURN
 
@@ -221,7 +213,7 @@ L(less_vec_from_wmemset):
 	bzhil %edx, %ecx, %ecx
 	kmovd %ecx, %k1
 # endif
-	vmovdqu8 %VEC(0), (%rax){%k1}
+	vmovdqu8 %VMM(0), (%rax){%k1}
 	VZEROUPPER_RETURN
 
 # if defined USE_MULTIARCH && IS_IN (libc)
@@ -249,8 +241,8 @@ L(stosb_more_2x_vec):
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
 	/* Store next 2x vec regardless.  */
-	VMOVU %VEC(0), (%rdi)
-	VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
+	VMOVU %VMM(0), (%rdi)
+	VMOVU %VMM(0), (VEC_SIZE * 1)(%rdi)
 
 
 	/* Two different methods of setting up pointers / compare. The two
@@ -278,8 +270,8 @@ L(more_2x_vec):
 #endif
 	/* Store next 2x vec regardless.  */
-	VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
-	VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
+	VMOVU %VMM(0), (VEC_SIZE * 2)(%rax)
+	VMOVU %VMM(0), (VEC_SIZE * 3)(%rax)
 
 
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
@@ -304,20 +296,20 @@ L(more_2x_vec):
 	andq $(VEC_SIZE * -2), %LOOP_REG
 	.p2align 4
 L(loop):
-	VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
-	VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
-	VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 	subq $-(VEC_SIZE * 4), %LOOP_REG
 	cmpq %END_REG, %LOOP_REG
 	jb L(loop)
 	.p2align 4,, MOV_SIZE
 L(last_4x_vec):
-	VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG)
-	VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
-	VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
-L(return):
+	VMOVU %VMM(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return_vzeroupper):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -355,7 +347,7 @@ L(cross_page):
 	jge L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ %XMM0, %SET_REG64
+	MOVQ %VMM_128(0), %SET_REG64
 #endif
 	cmpl $8, %edx
 	jge L(between_8_15)
@@ -374,8 +366,8 @@ L(between_0_0):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU %YMM0, (%LESS_VEC_REG)
-	VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
+	VMOVU %VMM_256(0), (%LESS_VEC_REG)
+	VMOVU %VMM_256(0), -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -383,8 +375,8 @@ L(between_32_63):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU %XMM0, (%LESS_VEC_REG)
-	VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
+	VMOVU %VMM_128(0), (%LESS_VEC_REG)
+	VMOVU %VMM_128(0), -16(%LESS_VEC_REG, %rdx)
 	ret
 #endif
 
@@ -394,8 +386,8 @@ L(between_16_31):
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVQ %XMM0, (%rdi)
-	MOVQ %XMM0, -8(%rdi, %rdx)
+	MOVQ %VMM_128(0), (%rdi)
+	MOVQ %VMM_128(0), -8(%rdi, %rdx)
 #else
 	movq %SET_REG64, (%LESS_VEC_REG)
 	movq %SET_REG64, -8(%LESS_VEC_REG, %rdx)
@@ -408,8 +400,8 @@ L(between_8_15):
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVD %XMM0, (%rdi)
-	MOVD %XMM0, -4(%rdi, %rdx)
+	MOVD %VMM_128(0), (%rdi)
+	MOVD %VMM_128(0), -4(%rdi, %rdx)
 #else
 	movl %SET_REG32, (%LESS_VEC_REG)
 	movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)
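The effect of the patch, reading the deletions above: each memset flavour used to carry its own copies of the vector-geometry macros (VEC_SIZE, MOV_SIZE, RET_SIZE), the register template (VEC(i), XMM0/YMM0/VEC0), the move mnemonics (VMOVU/VMOVA) and SECTION(p), and now pulls them from the matching shared x86-*-vecs.h header, naming registers through VMM(0) with VMM_128()/VMM_256() for the narrower views. Below is a minimal sketch of what the AVX2 flavour of such a header would centralize, assuming this grouping; the values are copied from the lines the patch deletes, but the layout is an illustration, not the actual contents of x86-avx-vecs.h.

/* Illustrative sketch only -- not the real x86-avx-vecs.h.  Values are taken
   from the AVX2 defines this patch removes; the VMM()/VMM_128()/VMM_256()
   names mirror how the patched code now refers to vector registers.  */
#ifndef VEC_SIZE
# define VEC_SIZE	32		/* One AVX2 vector is 32 bytes.  */
# define MOV_SIZE	4		/* Encoded length of a vector move.  */
# define RET_SIZE	4		/* Encoded length of the return sequence.  */

# define SECTION(p)	p##.avx		/* Text-section suffix for this variant.  */

# define VMOVU		vmovdqu		/* Unaligned vector move.  */
# define VMOVA		vmovdqa		/* Aligned vector move.  */

/* Width-agnostic register names: VMM(0) is the natural width for this
   variant; VMM_128()/VMM_256() give the explicit XMM/YMM views used by the
   small-size tail code.  */
# define VMM(i)		ymm##i
# define VMM_128(i)	xmm##i
# define VMM_256(i)	ymm##i
#endif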