path: root/sysdeps/x86_64/multiarch
author     Noah Goldstein <goldstein.w.n@gmail.com>  2022-10-18 17:44:03 -0700
committer  Noah Goldstein <goldstein.w.n@gmail.com>  2022-10-19 17:31:03 -0700
commit     330881763efff626d6b1cdf8de9ffee4ed7a1ba1 (patch)
tree       be5ed6967393bbb1b87d8ac6c1ed3cc1bdef25cd /sysdeps/x86_64/multiarch
parent     451c6e58540e8571e31581c04c4829e5d2cfe8ac (diff)
x86: Optimize memchr-evex.S and implement with VMM headers
Optimizations are:

1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
   in the short string case.
2. Restructure code so that small strings are given the hot path.
   - This is a net-zero on the benchmark suite but in general makes
     sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
   - tzcnt ...     -> bsf ...
   - vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially if it doesn't save fetch
   blocks / causes the basic-block to span extra cache-lines.

The optimizations (especially for point 2) make the memchr and
rawmemchr code essentially incompatible, so split rawmemchr-evex
to a new file.

Code Size Changes:
memchr-evex.S      : -107 bytes
rawmemchr-evex.S   :  -53 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value as New Time / Old Time, so < 1.0 is
improvement and > 1.0 is regression.

memchr-evex.S      : 0.928
rawmemchr-evex.S   : 0.986 (Less targets cross cache lines)

Full results attached in email.

Full check passes on x86-64.
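As a hedged illustration of optimization 1 (editor's sketch, not code from the patch): BMI1's tzcnt returns the operand width for a zero input, so a single length comparison can cover both "no match in the first vector" and "first match lies past the requested length". The names below (check_first_vec, the `more` out-parameter) are made up for the sketch; it assumes VEC_SIZE == 32 and a byte-granular match mask as in memchr.

```c
/* Hedged sketch of optimization 1; build with -mbmi.  */
#include <immintrin.h>
#include <stdbool.h>
#include <stddef.h>

/* mask: one bit per byte of the first 32-byte vector, set on a match.
   len:  requested length in bytes.  Sets *more when the caller should
   continue with the aligned multi-vector path.  */
const char *
check_first_vec (const char *p, unsigned int mask, size_t len, bool *more)
{
  unsigned int pos = _tzcnt_u32 (mask);   /* 32 when mask == 0 */
  *more = false;
  /* One compare covers both "mask == 0 and len <= 32" (scan finished,
     no match) and "first match at or beyond len" (out of bounds).  */
  if (len <= pos)
    return NULL;
  if (mask == 0)
    {
      *more = true;   /* no match yet, but len > 32: keep scanning */
      return NULL;
    }
  return p + pos;     /* in-bounds match */
}
```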
Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r--  sysdeps/x86_64/multiarch/memchr-evex.S         | 939
-rw-r--r--  sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S  |   9
-rw-r--r--  sysdeps/x86_64/multiarch/rawmemchr-evex.S      | 313
3 files changed, 851 insertions, 410 deletions
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 0dd4f1dcce..23a1c0018e 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -21,17 +21,27 @@
#if ISA_SHOULD_BUILD (4)
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
+
# ifndef MEMCHR
# define MEMCHR __memchr_evex
# endif
# ifdef USE_AS_WMEMCHR
+# define PC_SHIFT_GPR rcx
+# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
# define VPMINU vpminud
# define VPCMP vpcmpd
# define VPCMPEQ vpcmpeqd
# define CHAR_SIZE 4
+
+# define USE_WIDE_CHAR
# else
+# define PC_SHIFT_GPR rdi
+# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
# define VPMINU vpminub
# define VPCMP vpcmpb
@@ -39,534 +49,661 @@
# define CHAR_SIZE 1
# endif
- /* In the 4x loop the RTM and non-RTM versions have data pointer
- off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
- This is represented by BASE_OFFSET. As well because the RTM
- version uses vpcmp which stores a bit per element compared where
- the non-RTM version uses vpcmpeq which stores a bit per byte
- compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
- version. */
-# ifdef USE_IN_RTM
+# include "reg-macros.h"
+
+
+/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64
+ doesn't have VEX encoding), use VEX encoding in loop so we
+ can use vpcmpeqb + vptern which is more efficient than the
+ EVEX alternative. */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+# undef COND_VZEROUPPER
+# undef VZEROUPPER_RETURN
+# undef VZEROUPPER
+
+# define COND_VZEROUPPER
+# define VZEROUPPER_RETURN ret
# define VZEROUPPER
-# define BASE_OFFSET (VEC_SIZE * 4)
-# define RET_SCALE CHAR_SIZE
+
+# define USE_TERN_IN_LOOP 0
# else
+# define USE_TERN_IN_LOOP 1
+# undef VZEROUPPER
# define VZEROUPPER vzeroupper
-# define BASE_OFFSET 0
-# define RET_SCALE 1
# endif
- /* In the return from 4x loop memchr and rawmemchr versions have
- data pointers off by VEC_SIZE * 4 with memchr version being
- VEC_SIZE * 4 greater. */
-# ifdef USE_AS_RAWMEMCHR
-# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4))
-# define RAW_PTR_REG rcx
-# define ALGN_PTR_REG rdi
+# if USE_TERN_IN_LOOP
+ /* Resulting bitmask for vpmovmskb has 4-bits set for each wchar
+ so we don't want to multiply resulting index. */
+# define TERN_CHAR_MULT 1
+
+# ifdef USE_AS_WMEMCHR
+# define TEST_END() inc %VRCX
+# else
+# define TEST_END() add %rdx, %rcx
+# endif
# else
-# define RET_OFFSET BASE_OFFSET
-# define RAW_PTR_REG rdi
-# define ALGN_PTR_REG rcx
+# define TERN_CHAR_MULT CHAR_SIZE
+# define TEST_END() KORTEST %k2, %k3
# endif
-# define XMMZERO xmm23
-# define YMMZERO ymm23
-# define XMMMATCH xmm16
-# define YMMMATCH ymm16
-# define YMM1 ymm17
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+# ifndef USE_AS_WMEMCHR
+# define GPR_X0_IS_RET 1
+# else
+# define GPR_X0_IS_RET 0
+# endif
+# define GPR_X0 rax
+# else
+# define GPR_X0_IS_RET 0
+# define GPR_X0 rdx
+# endif
+
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-# ifndef SECTION
-# define SECTION(p) p##.evex
+# if CHAR_PER_VEC == 64
+# define LAST_VEC_OFFSET (VEC_SIZE * 3)
+# else
+# define LAST_VEC_OFFSET (VEC_SIZE * 2)
+# endif
+# if CHAR_PER_VEC >= 32
+# define MASK_GPR(...) VGPR(__VA_ARGS__)
+# elif CHAR_PER_VEC == 16
+# define MASK_GPR(reg) VGPR_SZ(reg, 16)
+# else
+# define MASK_GPR(reg) VGPR_SZ(reg, 8)
# endif
-# define VEC_SIZE 32
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-# define PAGE_SIZE 4096
+# define VMATCH VMM(0)
+# define VMATCH_LO VMM_lo(0)
- .section SECTION(.text),"ax",@progbits
+# define PAGE_SIZE 4096
+
+
+ .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCHR, 6)
-# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
test %RDX_LP, %RDX_LP
- jz L(zero)
+ jz L(zero_0)
-# ifdef __ILP32__
+# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
-# endif
# endif
- /* Broadcast CHAR to YMMMATCH. */
- VPBROADCAST %esi, %YMMMATCH
+ VPBROADCAST %esi, %VMATCH
/* Check if we may cross page boundary with one vector load. */
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- ja L(cross_page_boundary)
+ ja L(page_cross)
+
+ VPCMPEQ (%rdi), %VMATCH, %k0
+ KMOV %k0, %VRAX
+# ifndef USE_AS_WMEMCHR
+ /* If rcx is zero then tzcnt -> CHAR_PER_VEC. NB: there is
+ already a dependency between rcx and rsi so no worries about
+ false-dep here. */
+ tzcnt %VRAX, %VRSI
+ /* If rdx <= rsi then either 1) rcx was non-zero (there was a
+ match) but it was out of bounds or 2) rcx was zero and rdx
+ was <= VEC_SIZE so we are done scanning. */
+ cmpq %rsi, %rdx
+ /* NB: Use branch to return zero/non-zero. Common usage will
+ branch on result of function (if return is null/non-null).
+ This branch can be used to predict the ensuing one so there
+ is no reason to extend the data-dependency with cmovcc. */
+ jbe L(zero_0)
+
+ /* If rcx is zero then len must be > RDX, otherwise since we
+ already tested len vs lzcnt(rcx) (in rsi) we are good to
+ return this match. */
+ test %VRAX, %VRAX
+ jz L(more_1x_vec)
+ leaq (%rdi, %rsi), %rax
+# else
- /* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* If length < CHAR_PER_VEC handle special. */
+ /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
+ > 1, so if rcx is zero then tzcnt != CHAR_PER_VEC. */
cmpq $CHAR_PER_VEC, %rdx
- jbe L(first_vec_x0)
-# endif
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ ja L(more_1x_vec)
+ tzcnt %VRAX, %VRAX
+ cmpl %eax, %edx
+ jbe L(zero_0)
+L(first_vec_x0_ret):
leaq (%rdi, %rax, CHAR_SIZE), %rax
-# else
- addq %rdi, %rax
# endif
ret
-# ifndef USE_AS_RAWMEMCHR
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
-L(first_vec_x0):
- /* Check if first match was before length. NB: tzcnt has false data-
- dependency on destination. eax already had a data-dependency on esi
- so this should have no affect here. */
- tzcntl %eax, %esi
-# ifdef USE_AS_WMEMCHR
- leaq (%rdi, %rsi, CHAR_SIZE), %rdi
-# else
- addq %rsi, %rdi
-# endif
+ /* Only fits in first cache line for VEC_SIZE == 32. */
+# if VEC_SIZE == 32
+ .p2align 4,, 2
+L(zero_0):
xorl %eax, %eax
- cmpl %esi, %edx
- cmovg %rdi, %rax
ret
# endif
- .p2align 4
-L(cross_page_boundary):
- /* Save pointer before aligning as its original value is
- necessary for computer return address if byte is found or
- adjusting length if it is not and this is memchr. */
- movq %rdi, %rcx
- /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
- for rawmemchr. */
- andq $-VEC_SIZE, %ALGN_PTR_REG
- VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
- kmovd %k0, %r8d
+ .p2align 4,, 9
+L(more_1x_vec):
# ifdef USE_AS_WMEMCHR
- /* NB: Divide shift count by 4 since each bit in K0 represent 4
- bytes. */
- sarl $2, %eax
-# endif
-# ifndef USE_AS_RAWMEMCHR
- movl $(PAGE_SIZE / CHAR_SIZE), %esi
- subl %eax, %esi
+ /* For wmemchr we still need to test if there was a match in the
+ first VEC. Use bsf to test here so we can reuse
+ L(first_vec_x0_ret). */
+ bsf %VRAX, %VRAX
+ jnz L(first_vec_x0_ret)
# endif
+
+L(page_cross_continue):
# ifdef USE_AS_WMEMCHR
- andl $(CHAR_PER_VEC - 1), %eax
-# endif
- /* Remove the leading bytes. */
- sarxl %eax, %r8d, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* Check the end of data. */
- cmpq %rsi, %rdx
- jbe L(first_vec_x0)
+ /* We can't use end of the buffer to re-calculate length for
+ wmemchr as len * CHAR_SIZE may overflow. */
+ leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
+ andq $(VEC_SIZE * -1), %rdi
+ subq %rdi, %rax
+ sarq $2, %rax
+ addq %rdx, %rax
+# else
+ leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax
+ andq $(VEC_SIZE * -1), %rdi
+ subq %rdi, %rax
# endif
- testl %eax, %eax
- jz L(cross_page_continue)
- tzcntl %eax, %eax
+
+ /* rax contains remaining length - 1. -1 so we can get imm8
+ encoding in a few additional places saving code size. */
+
+ /* Needed regardless of remaining length. */
+ VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRDX
+
+ /* We cannot fold the above `sub %rdi, %rax` with the `cmp
+ $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
+ large length to overflow and cause the subtract to carry
+ despite length being above CHAR_PER_VEC * 2. */
+ cmpq $(CHAR_PER_VEC * 2 - 1), %rax
+ ja L(more_2x_vec)
+L(last_2x_vec):
+
+ test %VRDX, %VRDX
+ jnz L(first_vec_x1_check)
+
+ /* Check the end of data. NB: use 8-bit operations to save code
+ size. We no longer need the full-width of eax and will
+ perform a write-only operation over eax so there will be no
+ partial-register stalls. */
+ subb $(CHAR_PER_VEC * 1 - 1), %al
+ jle L(zero_0)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRCX
# ifdef USE_AS_WMEMCHR
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+ /* For wmemchr we can't take advantage of tzcnt(0) ==
+ VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */
+ test %VRCX, %VRCX
+ jz L(zero_0)
+# endif
+ tzcnt %VRCX, %VRCX
+ cmp %cl, %al
+
+ /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32. We give
+ fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
+ not enough space before the next cache line to fit the `lea`
+ for return. */
+# if VEC_SIZE == 64
+ ja L(first_vec_x2_ret)
+L(zero_0):
+ xorl %eax, %eax
+ ret
# else
- addq %RAW_PTR_REG, %rax
+ jbe L(zero_0)
+ leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
+ ret
# endif
+
+ .p2align 4,, 5
+L(first_vec_x1_check):
+ bsf %VRDX, %VRDX
+ cmpb %dl, %al
+ jb L(zero_4)
+ leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
ret
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ /* Fits at the end of the cache line here for VEC_SIZE == 32.
+ */
+# if VEC_SIZE == 32
+L(zero_4):
+ xorl %eax, %eax
ret
+# endif
- .p2align 4
+
+ .p2align 4,, 4
L(first_vec_x2):
- tzcntl %eax, %eax
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ bsf %VRCX, %VRCX
+L(first_vec_x2_ret):
+ leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
ret
- .p2align 4
-L(first_vec_x3):
- tzcntl %eax, %eax
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ /* Fits at the end of the cache line here for VEC_SIZE == 64.
+ */
+# if VEC_SIZE == 64
+L(zero_4):
+ xorl %eax, %eax
ret
+# endif
- .p2align 4
-L(first_vec_x4):
- tzcntl %eax, %eax
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ .p2align 4,, 4
+L(first_vec_x1):
+ bsf %VRDX, %VRDX
+ leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
ret
- .p2align 5
-L(aligned_more):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
-# ifndef USE_AS_RAWMEMCHR
- /* Align data to VEC_SIZE. */
-L(cross_page_continue):
- xorl %ecx, %ecx
- subl %edi, %ecx
- andq $-VEC_SIZE, %rdi
- /* esi is for adjusting length to see if near the end. */
- leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
-# ifdef USE_AS_WMEMCHR
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarl $2, %esi
-# endif
-# else
- andq $-VEC_SIZE, %rdi
-L(cross_page_continue):
-# endif
- /* Load first VEC regardless. */
- VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. If near end handle specially. */
- subq %rsi, %rdx
- jbe L(last_4x_vec_or_less)
-# endif
- testl %eax, %eax
+ .p2align 4,, 5
+L(more_2x_vec):
+ /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
+ length. */
+
+
+ /* Already computed matches for first VEC in rdx. */
+ test %VRDX, %VRDX
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ /* Needed regardless of next length check. */
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRCX
+
+ /* Check if we are near the end. */
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rax
+ ja L(more_4x_vec)
+
+ test %VRCX, %VRCX
+ jnz L(first_vec_x3_check)
+
+ /* Use 8-bit instructions to save code size. We won't use full-
+ width eax again and will perform a write-only operation to
+ eax so no worries about partial-register stalls. */
+ subb $(CHAR_PER_VEC * 3), %al
+ jb L(zero_2)
+L(last_vec_check):
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRCX
+# ifdef USE_AS_WMEMCHR
+ /* For wmemchr we can't take advantage of tzcnt(0) ==
+ VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */
+ test %VRCX, %VRCX
+ jz L(zero_2)
+# endif
+ tzcnt %VRCX, %VRCX
+ cmp %cl, %al
+ jae L(first_vec_x4_ret)
+L(zero_2):
+ xorl %eax, %eax
+ ret
+
+ /* Fits at the end of the cache line here for VEC_SIZE == 64.
+ For VEC_SIZE == 32 we put the return label at the end of
+ L(first_vec_x4). */
+# if VEC_SIZE == 64
+L(first_vec_x4_ret):
+ leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+ ret
+# endif
+
+ .p2align 4,, 6
+L(first_vec_x4):
+ bsf %VRCX, %VRCX
+# if VEC_SIZE == 32
+ /* Place L(first_vec_x4_ret) here as we can't fit it in the same
+ cache line as where it is called from so we might as well
+ save code size by reusing return of L(first_vec_x4). */
+L(first_vec_x4_ret):
+# endif
+ leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4,, 6
+L(first_vec_x3_check):
+ /* Need to adjust remaining length before checking. */
+ addb $-(CHAR_PER_VEC * 2), %al
+ bsf %VRCX, %VRCX
+ cmpb %cl, %al
+ jb L(zero_2)
+ leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4,, 6
+L(first_vec_x3):
+ bsf %VRCX, %VRCX
+ leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4,, 3
+# if !USE_TERN_IN_LOOP
+ .p2align 4,, 10
+# endif
+L(more_4x_vec):
+ test %VRCX, %VRCX
jnz L(first_vec_x3)
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
jnz L(first_vec_x4)
+ subq $-(VEC_SIZE * 5), %rdi
+ subq $(CHAR_PER_VEC * 8), %rax
+ jb L(last_4x_vec)
-# ifndef USE_AS_RAWMEMCHR
- /* Check if at last CHAR_PER_VEC * 4 length. */
- subq $(CHAR_PER_VEC * 4), %rdx
- jbe L(last_4x_vec_or_less_cmpeq)
- /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */
- addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-
- /* Align data to VEC_SIZE * 4 for the loop and readjust length.
- */
-# ifdef USE_AS_WMEMCHR
+# ifdef USE_AS_WMEMCHR
movl %edi, %ecx
- andq $-(4 * VEC_SIZE), %rdi
+# else
+ addq %rdi, %rax
+# endif
+
+
+# if VEC_SIZE == 64
+ /* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex
+ processor has partial register stalls (all have merging
+ uop). If that changes this can be removed. */
+ xorb %dil, %dil
+# else
+ andq $-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WMEMCHR
subl %edi, %ecx
- /* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %ecx
- addq %rcx, %rdx
-# else
- addq %rdi, %rdx
- andq $-(4 * VEC_SIZE), %rdi
- subq %rdi, %rdx
-# endif
+ addq %rcx, %rax
# else
- addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
- andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rax
# endif
-# ifdef USE_IN_RTM
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-# else
- /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
- encodable with EVEX registers (ymm16-ymm31). */
- vmovdqa64 %YMMMATCH, %ymm0
+
+
+
+# if USE_TERN_IN_LOOP
+ /* copy VMATCH to low ymm so we can use vpcmpeq which is not
+ encodable with EVEX registers. NB: this is VEC_SIZE == 32
+ only as there is no way to encode vpcmpeq with zmm0-15. */
+ vmovdqa64 %VMATCH, %VMATCH_LO
# endif
- /* Compare 4 * VEC at a time forward. */
- .p2align 4
+ .p2align 4,, 11
L(loop_4x_vec):
- /* Two versions of the loop. One that does not require
- vzeroupper by not using ymm0-ymm15 and another does that require
- vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
- is used at all is because there is no EVEX encoding vpcmpeq and
- with vpcmpeq this loop can be performed more efficiently. The
- non-vzeroupper version is safe for RTM while the vzeroupper
- version should be prefered if RTM are not supported. */
-# ifdef USE_IN_RTM
- /* It would be possible to save some instructions using 4x VPCMP
- but bottleneck on port 5 makes it not woth it. */
- VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
- /* xor will set bytes match esi to zero. */
- vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
- vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
- VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
- /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
- VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
- VPCMP $0, %YMM3, %YMMZERO, %k2
-# else
+ /* Two versions of the loop. One that does not require
+ vzeroupper by not using ymm0-15 and another that does
+ require vzeroupper because it uses ymm0-15. The reason why
+ ymm0-15 is used at all is because there is no EVEX encoding
+ of vpcmpeq and with vpcmpeq this loop can be performed more
+ efficiently. The non-vzeroupper version is safe for RTM
+ while the vzeroupper version should be preferred if RTM is
+ not supported. Which loop version we use is determined by
+ USE_TERN_IN_LOOP. */
+
+# if USE_TERN_IN_LOOP
/* Since vptern can only take 3x vectors fastest to do 1 vec
separately with EVEX vpcmp. */
# ifdef USE_AS_WMEMCHR
/* vptern can only accept masks for epi32/epi64 so can only save
- instruction using not equals mask on vptern with wmemchr. */
- VPCMP $4, (%rdi), %YMMMATCH, %k1
+ instruction using not equals mask on vptern with wmemchr.
+ */
+ VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
# else
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+ VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
# endif
/* Compare 3x with vpcmpeq and or them all together with vptern.
*/
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+ VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
# ifdef USE_AS_WMEMCHR
- /* This takes the not of or between ymm2, ymm3, ymm4 as well as
- combines result from VEC0 with zero mask. */
- vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
- vpmovmskb %ymm4, %ecx
+ /* This takes the not of or between VEC_lo(2), VEC_lo(3),
+ VEC_lo(4) as well as combines result from VEC(0) with zero
+ mask. */
+ vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
+ vpmovmskb %VMM_lo(4), %VRCX
# else
- /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */
- vpternlogd $254, %ymm2, %ymm3, %ymm4
- vpmovmskb %ymm4, %ecx
- kmovd %k1, %eax
+ /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
+ VEC_lo(4). */
+ vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+ vpmovmskb %VMM_lo(4), %VRCX
+ KMOV %k1, %edx
# endif
-# endif
-# ifdef USE_AS_RAWMEMCHR
- subq $-(VEC_SIZE * 4), %rdi
-# endif
-# ifdef USE_IN_RTM
- kortestd %k2, %k3
# else
-# ifdef USE_AS_WMEMCHR
- /* ecx contains not of matches. All 1s means no matches. incl will
- overflow and set zeroflag if that is the case. */
- incl %ecx
-# else
- /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
- to ecx is not an issue because if eax is non-zero it will be
- used for returning the match. If it is zero the add does
- nothing. */
- addq %rax, %rcx
-# endif
+ /* Loop version that uses EVEX encoding. */
+ VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
+ vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
+ vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
+ VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z}
+ VPTESTN %VMM(3), %VMM(3), %k2
# endif
-# ifdef USE_AS_RAWMEMCHR
- jz L(loop_4x_vec)
-# else
- jnz L(loop_4x_vec_end)
+
+
+ TEST_END ()
+ jnz L(loop_vec_ret)
subq $-(VEC_SIZE * 4), %rdi
- subq $(CHAR_PER_VEC * 4), %rdx
- ja L(loop_4x_vec)
+ subq $(CHAR_PER_VEC * 4), %rax
+ jae L(loop_4x_vec)
- /* Fall through into less than 4 remaining vectors of length case.
+ /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
*/
- VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
- addq $(BASE_OFFSET - VEC_SIZE), %rdi
- kmovd %k0, %eax
- VZEROUPPER
-
-L(last_4x_vec_or_less):
- /* Check if first VEC contained match. */
- testl %eax, %eax
- jnz L(first_vec_x1_check)
+ COND_VZEROUPPER
- /* If remaining length > CHAR_PER_VEC * 2. */
- addl $(CHAR_PER_VEC * 2), %edx
- jg L(last_4x_vec)
-
-L(last_2x_vec):
- /* If remaining length < CHAR_PER_VEC. */
- addl $CHAR_PER_VEC, %edx
- jle L(zero_end)
-
- /* Check VEC2 and compare any match with remaining length. */
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
- tzcntl %eax, %eax
- cmpl %eax, %edx
- jbe L(set_zero_end)
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end):
- ret
+ .p2align 4,, 10
+L(last_4x_vec):
+ /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
+ instructions on eax from here on out. */
+# if CHAR_PER_VEC != 64
+ andl $(CHAR_PER_VEC * 4 - 1), %eax
+# endif
+ VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
+ subq $(VEC_SIZE * 1), %rdi
+ KMOV %k0, %VRDX
+ cmpb $(CHAR_PER_VEC * 2 - 1), %al
+ jbe L(last_2x_vec)
+ test %VRDX, %VRDX
+ jnz L(last_vec_x1_novzero)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(last_vec_x2_novzero)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(first_vec_x3_check)
+
+ subb $(CHAR_PER_VEC * 3), %al
+ jae L(last_vec_check)
-L(set_zero_end):
xorl %eax, %eax
ret
- .p2align 4
-L(first_vec_x1_check):
- /* eax must be non-zero. Use bsfl to save code size. */
- bsfl %eax, %eax
- /* Adjust length. */
- subl $-(CHAR_PER_VEC * 4), %edx
- /* Check if match within remaining length. */
- cmpl %eax, %edx
- jbe L(set_zero_end)
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+ addq $VEC_SIZE, %rdi
+L(last_vec_x1_novzero):
+ bsf %VRDX, %VRDX
+ leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
ret
+# endif
- .p2align 4
-L(loop_4x_vec_end):
+# if CHAR_PER_VEC == 64
+ /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
+ 64 it needs a separate return label. */
+ .p2align 4,, 4
+L(last_vec_x2):
+L(last_vec_x2_novzero):
+ bsf %VRDX, %VRDX
+ leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
+ ret
# endif
- /* rawmemchr will fall through into this if match was found in
- loop. */
-# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
- /* k1 has not of matches with VEC1. */
- kmovd %k1, %eax
-# ifdef USE_AS_WMEMCHR
- subl $((1 << CHAR_PER_VEC) - 1), %eax
-# else
- incl %eax
-# endif
+ .p2align 4,, 4
+L(loop_vec_ret):
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+ KMOV %k1, %VRAX
+ inc %MASK_GPR(rax)
# else
- /* eax already has matches for VEC1. */
- testl %eax, %eax
+ test %VRDX, %VRDX
# endif
- jnz L(last_vec_x1_return)
+ jnz L(last_vec_x0)
-# ifdef USE_IN_RTM
- VPCMP $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %eax
+
+# if USE_TERN_IN_LOOP
+ vpmovmskb %VMM_lo(2), %VRDX
# else
- vpmovmskb %ymm2, %eax
+ VPTESTN %VMM(2), %VMM(2), %k1
+ KMOV %k1, %VRDX
# endif
- testl %eax, %eax
- jnz L(last_vec_x2_return)
+ test %VRDX, %VRDX
+ jnz L(last_vec_x1)
-# ifdef USE_IN_RTM
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x3_return)
- kmovd %k3, %eax
- tzcntl %eax, %eax
- leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+# if USE_TERN_IN_LOOP
+ vpmovmskb %VMM_lo(3), %VRDX
# else
- vpmovmskb %ymm3, %eax
- /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */
- salq $VEC_SIZE, %rcx
- orq %rcx, %rax
- tzcntq %rax, %rax
- leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
- VZEROUPPER
+ KMOV %k2, %VRDX
# endif
- ret
- .p2align 4,, 10
-L(last_vec_x1_return):
- tzcntl %eax, %eax
-# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+ /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+ (only if used VEX encoded loop). */
+ COND_VZEROUPPER
+
+ /* Separate logic for CHAR_PER_VEC == 64 vs the rest. For
+ CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
+ CHAR_PER_VEC <= 32 we can combine the results from the 2x
+ VEC in a single GPR. */
+# if CHAR_PER_VEC == 64
+# if USE_TERN_IN_LOOP
+# error "Unsupported"
+# endif
+
+
+ /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */
+ test %VRDX, %VRDX
+ jnz L(last_vec_x2)
+ KMOV %k3, %VRDX
# else
- addq %rdi, %rax
+ /* CHAR_PER_VEC <= 32 so we can combine the results from the
+ last 2x VEC. */
+
+# if !USE_TERN_IN_LOOP
+ KMOV %k3, %VRCX
+# endif
+ salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx
+ addq %rcx, %rdx
+# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+# endif
# endif
- VZEROUPPER
+ bsf %rdx, %rdx
+ leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
ret
- .p2align 4
-L(last_vec_x2_return):
- tzcntl %eax, %eax
- /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
- if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
- USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */
- leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
- VZEROUPPER
+ .p2align 4,, 8
+L(last_vec_x1):
+ COND_VZEROUPPER
+# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x1_novzero):
+# endif
+ bsf %VRDX, %VRDX
+ leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
ret
-# ifdef USE_IN_RTM
- .p2align 4
-L(last_vec_x3_return):
- tzcntl %eax, %eax
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+
+ .p2align 4,, 4
+L(last_vec_x0):
+ COND_VZEROUPPER
+ bsf %VGPR(GPR_X0), %VGPR(GPR_X0)
+# if GPR_X0_IS_RET
+ addq %rdi, %rax
+# else
+ leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax
+# endif
ret
+
+ .p2align 4,, 6
+L(page_cross):
+ /* Need to preserve eax to compute inbound bytes we are
+ checking. */
+# ifdef USE_AS_WMEMCHR
+ movl %eax, %ecx
+# else
+ xorl %ecx, %ecx
+ subl %eax, %ecx
# endif
-# ifndef USE_AS_RAWMEMCHR
- .p2align 4,, 5
-L(last_4x_vec_or_less_cmpeq):
- VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
- subq $-(VEC_SIZE * 4), %rdi
- /* Check first VEC regardless. */
- testl %eax, %eax
- jnz L(first_vec_x1_check)
+ xorq %rdi, %rax
+ VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+ KMOV %k0, %VRAX
- /* If remaining length <= CHAR_PER_VEC * 2. */
- addl $(CHAR_PER_VEC * 2), %edx
- jle L(last_2x_vec)
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide by CHAR_SIZE to shift out out of bounds bytes. */
+ shrl $2, %ecx
+ andl $(CHAR_PER_VEC - 1), %ecx
+# endif
- .p2align 4
-L(last_4x_vec):
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
+ shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
- /* Create mask for possible matches within remaining length. */
-# ifdef USE_AS_WMEMCHR
- movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
- bzhil %edx, %ecx, %ecx
-# else
- movq $-1, %rcx
- bzhiq %rdx, %rcx, %rcx
-# endif
- /* Test matches in data against length match. */
- andl %ecx, %eax
- jnz L(last_vec_x3)
+# ifdef USE_AS_WMEMCHR
+ negl %ecx
+# endif
- /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
- remaining length was found to be > CHAR_PER_VEC * 2. */
- subl $CHAR_PER_VEC, %edx
- jbe L(zero_end2)
+ /* mask lower bits from ecx (negative eax) to get bytes till
+ next VEC. */
+ andl $(CHAR_PER_VEC - 1), %ecx
+ /* Check if VEC is entirely contained in the remainder of the
+ page. */
+ cmpq %rcx, %rdx
+ jbe L(page_cross_ret)
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
- kmovd %k0, %eax
- /* Shift remaining length mask for last VEC. */
-# ifdef USE_AS_WMEMCHR
- shrl $CHAR_PER_VEC, %ecx
-# else
- shrq $CHAR_PER_VEC, %rcx
-# endif
- andl %ecx, %eax
- jz L(zero_end2)
- bsfl %eax, %eax
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end2):
- ret
+ /* Length crosses the page so if rax is zero (no matches)
+ continue. */
+ test %VRAX, %VRAX
+ jz L(page_cross_continue)
-L(last_vec_x2):
- tzcntl %eax, %eax
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ /* if rdx > rcx then any match here must be in [buf:buf + len].
+ */
+ tzcnt %VRAX, %VRAX
+# ifdef USE_AS_WMEMCHR
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
+# endif
ret
- .p2align 4
-L(last_vec_x3):
- tzcntl %eax, %eax
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ .p2align 4,, 2
+L(page_cross_zero):
+ xorl %eax, %eax
ret
+
+ .p2align 4,, 4
+L(page_cross_ret):
+ /* Search is entirely contained in page cross case. */
+# ifdef USE_AS_WMEMCHR
+ test %VRAX, %VRAX
+ jz L(page_cross_zero)
+# endif
+ tzcnt %VRAX, %VRAX
+ cmpl %eax, %edx
+ jbe L(page_cross_zero)
+# ifdef USE_AS_WMEMCHR
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
- /* 7 bytes from next cache line. */
+ ret
END (MEMCHR)
#endif
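The loop comments in the memchr-evex.S diff above describe comparing three vectors with vpcmpeqb and folding the results with vpternlogd (immediate 254 is the truth table for a | b | c), so a single vpmovmskb / test covers three vectors at once. The following is a hedged editor's sketch of that data flow using intrinsics; the function name match_in_3x_vec is illustrative, and no claim is made about matching the exact encodings the hand-written assembly selects.

```c
/* Hedged sketch of the vpcmpeqb + vpternlogd fold used in the loop;
   requires AVX2 and AVX512F/AVX512VL (e.g. -mavx512vl).  */
#include <immintrin.h>
#include <stdbool.h>

bool
match_in_3x_vec (const __m256i *p, __m256i vmatch)
{
  __m256i m1 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (p + 0), vmatch);
  __m256i m2 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (p + 1), vmatch);
  __m256i m3 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (p + 2), vmatch);
  /* 254 (0xfe) is the ternary truth table for m1 | m2 | m3, the same
     immediate the `vpternlogd $254, ...` in the loop uses.  */
  __m256i any = _mm256_ternarylogic_epi32 (m1, m2, m3, 0xfe);
  return _mm256_movemask_epi8 (any) != 0;
}
```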
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
index deda1ca395..2073eaa620 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -1,3 +1,6 @@
-#define MEMCHR __rawmemchr_evex_rtm
-#define USE_AS_RAWMEMCHR 1
-#include "memchr-evex-rtm.S"
+#define RAWMEMCHR __rawmemchr_evex_rtm
+
+#define USE_IN_RTM 1
+#define SECTION(p) p##.evex.rtm
+
+#include "rawmemchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
index dc1c450699..dad54def2b 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
@@ -1,7 +1,308 @@
-#ifndef RAWMEMCHR
-# define RAWMEMCHR __rawmemchr_evex
-#endif
-#define USE_AS_RAWMEMCHR 1
-#define MEMCHR RAWMEMCHR
+/* rawmemchr optimized with 256-bit EVEX instructions.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
+
+# ifndef RAWMEMCHR
+# define RAWMEMCHR __rawmemchr_evex
+# endif
+
+
+# define PC_SHIFT_GPR rdi
+# define REG_WIDTH VEC_SIZE
+# define VPTESTN vptestnmb
+# define VPBROADCAST vpbroadcastb
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
+
+# include "reg-macros.h"
+
+/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64
+ doesn't have VEX encoding), use VEX encoding in loop so we
+ can use vpcmpeqb + vptern which is more efficient than the
+ EVEX alternative. */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+# undef COND_VZEROUPPER
+# undef VZEROUPPER_RETURN
+# undef VZEROUPPER
+
+
+# define COND_VZEROUPPER
+# define VZEROUPPER_RETURN ret
+# define VZEROUPPER
+
+# define USE_TERN_IN_LOOP 0
+# else
+# define USE_TERN_IN_LOOP 1
+# undef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define CHAR_PER_VEC VEC_SIZE
+
+# if CHAR_PER_VEC == 64
+
+# define TAIL_RETURN_LBL first_vec_x2
+# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
+
+# define FALLTHROUGH_RETURN_LBL first_vec_x3
+# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
+
+# else /* !(CHAR_PER_VEC == 64) */
+
+# define TAIL_RETURN_LBL first_vec_x3
+# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
+
+# define FALLTHROUGH_RETURN_LBL first_vec_x2
+# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
+# endif /* !(CHAR_PER_VEC == 64) */
+
+
+# define VMATCH VMM(0)
+# define VMATCH_LO VMM_lo(0)
+
+# define PAGE_SIZE 4096
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (RAWMEMCHR, 6)
+ VPBROADCAST %esi, %VMATCH
+ /* Check if we may cross page boundary with one vector load. */
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(page_cross)
+
+ VPCMPEQ (%rdi), %VMATCH, %k0
+ KMOV %k0, %VRAX
+
+ test %VRAX, %VRAX
+ jz L(aligned_more)
+L(first_vec_x0):
+ bsf %VRAX, %VRAX
+ addq %rdi, %rax
+ ret
+
+ .p2align 4,, 4
+L(first_vec_x4):
+ bsf %VRAX, %VRAX
+ leaq (VEC_SIZE * 4)(%rdi, %rax), %rax
+ ret
-#include "memchr-evex.S"
+ /* For VEC_SIZE == 32 we can fit this in aligning bytes so might
+ as well place it more locally. For VEC_SIZE == 64 we reuse
+ return code at the end of loop's return. */
+# if VEC_SIZE == 32
+ .p2align 4,, 4
+L(FALLTHROUGH_RETURN_LBL):
+ bsf %VRAX, %VRAX
+ leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+ ret
+# endif
+
+ .p2align 4,, 6
+L(page_cross):
+ /* eax has lower page-offset bits of rdi so xor will zero them
+ out. */
+ xorq %rdi, %rax
+ VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+ KMOV %k0, %VRAX
+
+ /* Shift out out-of-bounds matches. */
+ shrx %VRDI, %VRAX, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x0)
+
+ .p2align 4,, 10
+L(aligned_more):
+L(page_cross_continue):
+ /* Align pointer. */
+ andq $(VEC_SIZE * -1), %rdi
+
+ VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x3)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x4)
+
+ subq $-(VEC_SIZE * 1), %rdi
+# if VEC_SIZE == 64
+ /* Saves code size. No evex512 processor has partial register
+ stalls. If that changes this can be replaced with `andq
+ $-(VEC_SIZE * 4), %rdi`. */
+ xorb %dil, %dil
+# else
+ andq $-(VEC_SIZE * 4), %rdi
+# endif
+
+# if USE_TERN_IN_LOOP
+ /* copy VMATCH to low ymm so we can use vpcmpeq which is not
+ encodable with EVEX registers. NB: this is VEC_SIZE == 32
+ only as there is no way to encode vpcmpeq with zmm0-15. */
+ vmovdqa64 %VMATCH, %VMATCH_LO
+# endif
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Two versions of the loop. One that does not require
+ vzeroupper by not using ymm0-15 and another that does
+ require vzeroupper because it uses ymm0-15. The reason why
+ ymm0-15 is used at all is because there is no EVEX encoding
+ of vpcmpeq and with vpcmpeq this loop can be performed more
+ efficiently. The non-vzeroupper version is safe for RTM
+ while the vzeroupper version should be preferred if RTM is
+ not supported. Which loop version we use is determined by
+ USE_TERN_IN_LOOP. */
+
+# if USE_TERN_IN_LOOP
+ /* Since vptern can only take 3x vectors fastest to do 1 vec
+ separately with EVEX vpcmp. */
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+ /* Compare 3x with vpcmpeq and or them all together with vptern.
+ */
+
+ VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
+ subq $(VEC_SIZE * -4), %rdi
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
+
+ /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
+ VEC_lo(4). */
+ vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+ vpmovmskb %VMM_lo(4), %VRCX
+
+ KMOV %k1, %eax
+
+ /* NB: rax has match from first VEC and rcx has matches from
+ VEC 2-4. If rax is non-zero we will return that match. If
+ rax is zero adding won't disturb the bits in rcx. */
+ add %rax, %rcx
+# else
+ /* Loop version that uses EVEX encoding. */
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+ vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
+ vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
+ VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3
+ VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z}
+ VPTESTN %VMM(3), %VMM(3), %k2
+ subq $(VEC_SIZE * -4), %rdi
+ KORTEST %k2, %k3
+# endif
+ jz L(loop_4x_vec)
+
+# if USE_TERN_IN_LOOP
+ test %VRAX, %VRAX
+# else
+ KMOV %k1, %VRAX
+ inc %VRAX
+# endif
+ jnz L(last_vec_x0)
+
+
+# if USE_TERN_IN_LOOP
+ vpmovmskb %VMM_lo(2), %VRAX
+# else
+ VPTESTN %VMM(2), %VMM(2), %k1
+ KMOV %k1, %VRAX
+# endif
+ test %VRAX, %VRAX
+ jnz L(last_vec_x1)
+
+
+# if USE_TERN_IN_LOOP
+ vpmovmskb %VMM_lo(3), %VRAX
+# else
+ KMOV %k2, %VRAX
+# endif
+
+ /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+ (only if used VEX encoded loop). */
+ COND_VZEROUPPER
+
+ /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+ returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+ individually, for VEC_SIZE == 32 we combine them in a single
+ 64-bit GPR. */
+# if CHAR_PER_VEC == 64
+# if USE_TERN_IN_LOOP
+# error "Unsupported"
+# endif
+
+
+ /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */
+ test %VRAX, %VRAX
+ jnz L(first_vec_x2)
+ KMOV %k3, %VRAX
+L(FALLTHROUGH_RETURN_LBL):
+# else
+ /* CHAR_PER_VEC <= 32 so we can combine the results from the
+ last 2x VEC. */
+# if !USE_TERN_IN_LOOP
+ KMOV %k3, %VRCX
+# endif
+ salq $CHAR_PER_VEC, %rcx
+ addq %rcx, %rax
+# endif
+ bsf %rax, %rax
+ leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+ ret
+
+ .p2align 4,, 8
+L(TAIL_RETURN_LBL):
+ bsf %rax, %rax
+ leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
+ ret
+
+ .p2align 4,, 8
+L(last_vec_x1):
+ COND_VZEROUPPER
+L(first_vec_x1):
+ bsf %VRAX, %VRAX
+ leaq (VEC_SIZE * 1)(%rdi, %rax), %rax
+ ret
+
+ .p2align 4,, 8
+L(last_vec_x0):
+ COND_VZEROUPPER
+ bsf %VRAX, %VRAX
+ addq %rdi, %rax
+ ret
+END (RAWMEMCHR)
+#endif
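Both files handle a possible page cross by loading the last VEC_SIZE bytes of the current page and shifting out the bytes that precede the start pointer (the `xorq %rdi, %rax` / `shrx` sequence in the L(page_cross) paths above). Below is a hedged editor's sketch of that idea for VEC_SIZE == 32; probe_page_end and its contract are illustrative names, it omits the additional length check the memchr version performs, and it assumes the caller has already verified that the pointer sits within VEC_SIZE bytes of the page end, as the entry code does.

```c
/* Hedged sketch of the page-cross probe; assumes AVX2 (-mavx2) and that
   ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE on entry.  */
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

const char *
probe_page_end (const char *p, int c, int *no_match)
{
  uintptr_t off = (uintptr_t) p & (PAGE_SIZE - 1);
  /* Page-aligned base, equivalent to the `xorq %rdi, %rax` after
     `andl $(PAGE_SIZE - 1), %eax` in the assembly.  */
  const char *page = (const char *) ((uintptr_t) p - off);
  __m256i v = _mm256_loadu_si256 ((const __m256i *) (page + PAGE_SIZE - VEC_SIZE));
  unsigned int mask
    = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, _mm256_set1_epi8 ((char) c)));
  /* Bit i corresponds to byte page + PAGE_SIZE - VEC_SIZE + i; shift out
     the bits for bytes before p (the `shrx` step).  */
  mask >>= off - (PAGE_SIZE - VEC_SIZE);
  *no_match = (mask == 0);	/* caller falls through to the aligned loop */
  return mask ? p + __builtin_ctz (mask) : NULL;
}
```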