author     Matthew Sterrett <matthew.sterrett@intel.com>    2023-12-15 12:04:05 -0800
committer  Noah Goldstein <goldstein.w.n@gmail.com>         2023-12-18 12:38:01 -0600
commit     e957308723ac2e55dad360d602298632980bbd38 (patch)
tree       75b99dddc0746f3e950e43eda54c51449dbbe612
parent     442983319ba70de801fc856e8dd4748fba8f7f1b (diff)
x86: Unifies 'strlen-evex' and 'strlen-evex512' implementations.
This commit uses a common implementation 'strlen-evex-base.S' for both
'strlen-evex' and 'strlen-evex512'.

The motivation is to reduce the number of implementations to maintain.
This incidentally gives a small performance improvement.

All tests pass on x86.

Benchmarks were taken on SKX.
https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html

Geometric mean for strlen-evex512 over all benchmarks (N=10) was (new/old) 0.939
Geometric mean for wcslen-evex512 over all benchmarks (N=10) was (new/old) 0.965

Code Size Changes:
strlen-evex512.S : +24 bytes
wcslen-evex512.S : +54 bytes

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
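The geometric-mean figures above summarize per-benchmark new/old timing ratios across the benchmark suite; a value below 1.0 means the new code is faster on average. Below is a minimal sketch of how such a summary can be computed. The helper name and the plain-list input format are assumptions for illustration only, not part of this patch or of glibc's benchtests tooling.

```python
# Hypothetical helper: summarize benchmark results as the geometric mean
# of new/old timing ratios.  Inputs are two lists of per-benchmark mean
# timings (same benchmarks, same order) taken before and after a change.
from math import exp, log

def geomean_ratio(new_times, old_times):
    """Return the geometric mean of new/old ratios (< 1.0 means faster)."""
    ratios = [n / o for n, o in zip(new_times, old_times)]
    return exp(sum(log(r) for r in ratios) / len(ratios))

# Example with made-up numbers for three benchmark configurations:
print(geomean_ratio([9.4, 10.1, 8.8], [10.0, 10.5, 9.5]))  # ~0.94
```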
-rw-r--r--  sysdeps/x86_64/multiarch/strlen-evex-base.S   380
-rw-r--r--  sysdeps/x86_64/multiarch/strlen-evex.S        250
-rw-r--r--  sysdeps/x86_64/multiarch/strnlen-evex512.S    266
-rw-r--r--  sysdeps/x86_64/multiarch/wcslen-evex512.S       6
-rw-r--r--  sysdeps/x86_64/multiarch/wcsnlen-evex512.S      9
5 files changed, 439 insertions, 472 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 7305b24e28..77dc89900a 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -1,5 +1,5 @@
-/* Placeholder function, not used by any processor at the moment.
- Copyright (C) 2022-2023 Free Software Foundation, Inc.
+/* strlen/wcslen optimized with 256/512-bit EVEX instructions.
+ Copyright (C) 2021-2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,7 +16,6 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-/* UNUSED. Exists purely as reference implementation. */
#include <isa-level.h>
@@ -26,272 +25,211 @@
# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
+# define VPCMPNEQ vpcmpneqd
# define VPTESTN vptestnmd
+# define VPTEST vptestmd
# define VPMINU vpminud
# define CHAR_SIZE 4
+# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg
# else
# define VPCMPEQ vpcmpeqb
+# define VPCMPNEQ vpcmpneqb
# define VPTESTN vptestnmb
+# define VPTEST vptestmb
# define VPMINU vpminub
# define CHAR_SIZE 1
+# define CHAR_SIZE_SHIFT_REG(reg)
+
+# define REG_WIDTH VEC_SIZE
# endif
-# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
- .section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
- one vector length string. */
-ENTRY_P2ALIGN (STRLEN, 6)
-# ifdef USE_AS_STRNLEN
- /* Check zero length. */
- test %RSI_LP, %RSI_LP
- jz L(ret_max)
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+# define TAIL_RETURN_LBL first_vec_x2
+# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
+
+# define FALLTHROUGH_RETURN_LBL first_vec_x3
+# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
+
+# else
+
+# define TAIL_RETURN_LBL first_vec_x3
+# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
+
+# define FALLTHROUGH_RETURN_LBL first_vec_x2
+# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
# endif
+# define XZERO VMM_128(0)
+# define VZERO VMM(0)
+# define PAGE_SIZE 4096
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRLEN, 6)
movl %edi, %eax
- vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
- sall $20, %eax
- cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
- ja L(page_cross)
-
- /* Compare [w]char for null, mask bit will be set for match. */
- VPCMPEQ (%rdi), %VMM(0), %k0
-# ifdef USE_AS_STRNLEN
- KMOV %k0, %VRCX
- /* Store max length in rax. */
- mov %rsi, %rax
- /* If rcx is 0, rax will have max length. We can not use VRCX
- and VRAX here for evex256 because, upper 32 bits may be
- undefined for ecx and eax. */
- bsfq %rcx, %rax
- cmp $CHAR_PER_VEC, %rax
- ja L(align_more)
- cmpq %rax, %rsi
- cmovb %esi, %eax
-# else
+ vpxorq %XZERO, %XZERO, %XZERO
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
+ null byte. */
+ VPCMPEQ (%rdi), %VZERO, %k0
KMOV %k0, %VRAX
test %VRAX, %VRAX
- jz L(align_more)
+ jz L(aligned_more)
bsf %VRAX, %VRAX
-# endif
ret
- /* At this point vector max length reached. */
-# ifdef USE_AS_STRNLEN
- .p2align 4,,3
-L(ret_max):
- movq %rsi, %rax
+ .p2align 4,, 8
+L(first_vec_x4):
+ bsf %VRAX, %VRAX
+ subl %ecx, %edi
+ CHAR_SIZE_SHIFT_REG (edi)
+ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
ret
-# endif
-L(align_more):
- mov %rdi, %rax
- /* Align rax to VEC_SIZE. */
- andq $-VEC_SIZE, %rax
-# ifdef USE_AS_STRNLEN
- movq %rdi, %rdx
- subq %rax, %rdx
-# ifdef USE_AS_WCSLEN
- shr $2, %VRDX
-# endif
- /* At this point rdx contains [w]chars already compared. */
- leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
- /* At this point rdx contains number of w[char] needs to go.
- Now onwards rdx will keep decrementing with each compare. */
-# endif
-
- /* Loop unroll 4 times for 4 vector loop. */
- VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
- subq $-VEC_SIZE, %rax
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x1)
-# ifdef USE_AS_STRNLEN
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-# endif
- VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x2)
+ /* Aligned more for strnlen compares remaining length vs 2 *
+ CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+ going to the loop. */
+ .p2align 4,, 10
+L(aligned_more):
+ movq %rdi, %rcx
+ andq $(VEC_SIZE * -1), %rdi
+L(cross_page_continue):
+ /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+ rechecking bounds. */
+ VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x1)
-# ifdef USE_AS_STRNLEN
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-# endif
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x3)
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x3)
-# ifdef USE_AS_STRNLEN
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
-# endif
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x4)
- VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
- KMOV %k0, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x4)
+ subq $(VEC_SIZE * -1), %rdi
-# ifdef USE_AS_STRNLEN
- subq $CHAR_PER_VEC, %rdx
- jbe L(ret_max)
- /* Save pointer before 4 x VEC_SIZE alignment. */
- movq %rax, %rcx
+# if CHAR_PER_VEC == 64
+ /* No partial register stalls on processors that we use evex512
+ on and this saves code size. */
+ xorb %dil, %dil
+# else
+ andq $-(VEC_SIZE * 4), %rdi
# endif
- /* Align address to VEC_SIZE * 4 for loop. */
- andq $-(VEC_SIZE * 4), %rax
-
-# ifdef USE_AS_STRNLEN
- subq %rax, %rcx
-# ifdef USE_AS_WCSLEN
- shr $2, %VRCX
-# endif
- /* rcx contains number of [w]char will be recompared due to
- alignment fixes. rdx must be incremented by rcx to offset
- alignment adjustment. */
- addq %rcx, %rdx
- /* Need jump as we don't want to add/subtract rdx for first
- iteration of 4 x VEC_SIZE aligned loop. */
-# endif
- .p2align 4,,11
-L(loop):
- /* VPMINU and VPCMP combination provide better performance as
- compared to alternative combinations. */
- VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
- VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
- VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
- VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
+ /* Compare 4 * VEC at a time forward. */
+ .p2align 4
+L(loop_4x_vec):
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
+ VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
+ VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
VPTESTN %VMM(2), %VMM(2), %k0
- VPTESTN %VMM(4), %VMM(4), %k1
+ VPTESTN %VMM(4), %VMM(4), %k2
- subq $-(VEC_SIZE * 4), %rax
- KORTEST %k0, %k1
+ subq $-(VEC_SIZE * 4), %rdi
+ KORTEST %k0, %k2
+ jz L(loop_4x_vec)
-# ifndef USE_AS_STRNLEN
- jz L(loop)
+ VPTESTN %VMM(1), %VMM(1), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x0)
+
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x1)
+
+ VPTESTN %VMM(3), %VMM(3), %k0
+
+# if CHAR_PER_VEC == 64
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x2)
+ KMOV %k2, %VRAX
# else
- jnz L(loopend)
- subq $(CHAR_PER_VEC * 4), %rdx
- ja L(loop)
- mov %rsi, %rax
+ /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. */
+ kmovd %k2, %edx
+ kmovd %k0, %eax
+ salq $CHAR_PER_VEC, %rdx
+ orq %rdx, %rax
+# endif
+
+ /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. */
+ .p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+ bsfq %rax, %rax
+ subq %rcx, %rdi
+ CHAR_SIZE_SHIFT_REG (rdi)
+ leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
ret
-# endif
-L(loopend):
-
- VPTESTN %VMM(1), %VMM(1), %k2
- KMOV %k2, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x1)
-
- KMOV %k0, %VRCX
- /* At this point, if k0 is non zero, null char must be in the
- second vector. */
- test %VRCX, %VRCX
- jnz L(ret_vec_x2)
-
- VPTESTN %VMM(3), %VMM(3), %k3
- KMOV %k3, %VRCX
- test %VRCX, %VRCX
- jnz L(ret_vec_x3)
- /* At this point null [w]char must be in the fourth vector so no
- need to check. */
- KMOV %k1, %VRCX
-
- /* Fourth, third, second vector terminating are pretty much
- same, implemented this way to avoid branching and reuse code
- from pre loop exit condition. */
-L(ret_vec_x4):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- subq $-(VEC_SIZE * 3), %rax
- shrq $2, %rax
- addq %rcx, %rax
-# else
- leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
-# endif
+ .p2align 4,, 8
+L(first_vec_x0):
+ bsf %VRAX, %VRAX
+ sub %rcx, %rdi
+ CHAR_SIZE_SHIFT_REG (rdi)
+ addq %rdi, %rax
ret
-L(ret_vec_x3):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- subq $-(VEC_SIZE * 2), %rax
- shrq $2, %rax
- addq %rcx, %rax
-# else
- leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
-# endif
+ .p2align 4,, 10
+L(first_vec_x1):
+ bsf %VRAX, %VRAX
+ sub %rcx, %rdi
+ CHAR_SIZE_SHIFT_REG (rdi)
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
ret
-L(ret_vec_x2):
- subq $-VEC_SIZE, %rax
-L(ret_vec_x1):
- bsf %VRCX, %VRCX
- subq %rdi, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- addq %rcx, %rax
-# ifdef USE_AS_STRNLEN
- cmpq %rsi, %rax
- cmovnb %rsi, %rax
-# endif
+ .p2align 4,, 10
+ /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. */
+L(TAIL_RETURN_LBL):
+ bsf %VRAX, %VRAX
+ sub %VRCX, %VRDI
+ CHAR_SIZE_SHIFT_REG (VRDI)
+ lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
ret
-L(page_cross):
- mov %rdi, %rax
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
+ .p2align 4,, 8
+L(cross_page_boundary):
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. */
+ andq $-VEC_SIZE, %rdi
+
+ VPCMPEQ (%rdi), %VZERO, %k0
+
+ KMOV %k0, %VRAX
# ifdef USE_AS_WCSLEN
- sarl $2, %ecx
-# endif
- /* ecx contains number of w[char] to be skipped as a result
- of address alignment. */
- andq $-VEC_SIZE, %rax
- VPCMPEQ (%rax), %VMM(0), %k0
- KMOV %k0, %VRDX
- /* Ignore number of character for alignment adjustment. */
- shr %cl, %VRDX
-# ifdef USE_AS_STRNLEN
- jnz L(page_cross_end)
- movl $CHAR_PER_VEC, %eax
- sub %ecx, %eax
- cmp %rax, %rsi
- ja L(align_more)
+ movl %ecx, %edx
+ shrl $2, %edx
+ andl $(CHAR_PER_VEC - 1), %edx
+ shrx %edx, %eax, %eax
+ testl %eax, %eax
# else
- jz L(align_more)
-# endif
-
-L(page_cross_end):
- bsf %VRDX, %VRAX
-# ifdef USE_AS_STRNLEN
- cmpq %rsi, %rax
- cmovnb %esi, %eax
+ shr %cl, %VRAX
# endif
+ jz L(cross_page_continue)
+ bsf %VRAX, %VRAX
ret
-END (STRLEN)
+END(STRLEN)
#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 364eeffff6..93ad15e356 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -1,245 +1,7 @@
-/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
- Copyright (C) 2021-2023 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifndef STRLEN
-# define STRLEN __strlen_evex
-# endif
-
-# ifndef VEC_SIZE
-# include "x86-evex256-vecs.h"
-# endif
-
-# ifdef USE_AS_WCSLEN
-# define VPCMPEQ vpcmpeqd
-# define VPCMPNEQ vpcmpneqd
-# define VPTESTN vptestnmd
-# define VPTEST vptestmd
-# define VPMINU vpminud
-# define CHAR_SIZE 4
-# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg
-# else
-# define VPCMPEQ vpcmpeqb
-# define VPCMPNEQ vpcmpneqb
-# define VPTESTN vptestnmb
-# define VPTEST vptestmb
-# define VPMINU vpminub
-# define CHAR_SIZE 1
-# define CHAR_SIZE_SHIFT_REG(reg)
-
-# define REG_WIDTH VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 64
-
-# define TAIL_RETURN_LBL first_vec_x2
-# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
-
-# define FALLTHROUGH_RETURN_LBL first_vec_x3
-# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
-
-# else
-
-# define TAIL_RETURN_LBL first_vec_x3
-# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
-
-# define FALLTHROUGH_RETURN_LBL first_vec_x2
-# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
-# endif
-
-# define XZERO VMM_128(0)
-# define VZERO VMM(0)
-# define PAGE_SIZE 4096
-
- .section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRLEN, 6)
- movl %edi, %eax
- vpxorq %XZERO, %XZERO, %XZERO
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- ja L(cross_page_boundary)
-
- /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
- null byte. */
- VPCMPEQ (%rdi), %VZERO, %k0
- KMOV %k0, %VRAX
- test %VRAX, %VRAX
- jz L(aligned_more)
- bsf %VRAX, %VRAX
- ret
-
- .p2align 4,, 8
-L(first_vec_x4):
- bsf %VRAX, %VRAX
- subl %ecx, %edi
- CHAR_SIZE_SHIFT_REG (edi)
- leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
- ret
-
-
-
- /* Aligned more for strnlen compares remaining length vs 2 *
- CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
- going to the loop. */
- .p2align 4,, 10
-L(aligned_more):
- movq %rdi, %rcx
- andq $(VEC_SIZE * -1), %rdi
-L(cross_page_continue):
- /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
- rechecking bounds. */
- VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
- KMOV %k0, %VRAX
- test %VRAX, %VRAX
- jnz L(first_vec_x1)
-
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
- KMOV %k0, %VRAX
- test %VRAX, %VRAX
- jnz L(first_vec_x2)
-
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
- KMOV %k0, %VRAX
- test %VRAX, %VRAX
- jnz L(first_vec_x3)
-
- VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
- KMOV %k0, %VRAX
- test %VRAX, %VRAX
- jnz L(first_vec_x4)
-
- subq $(VEC_SIZE * -1), %rdi
-
-# if CHAR_PER_VEC == 64
- /* No partial register stalls on processors that we use evex512
- on and this saves code size. */
- xorb %dil, %dil
-# else
- andq $-(VEC_SIZE * 4), %rdi
-# endif
-
-
-
- /* Compare 4 * VEC at a time forward. */
- .p2align 4
-L(loop_4x_vec):
- VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
- VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
- VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
- VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
- VPTESTN %VMM(2), %VMM(2), %k0
- VPTESTN %VMM(4), %VMM(4), %k2
-
- subq $-(VEC_SIZE * 4), %rdi
- KORTEST %k0, %k2
- jz L(loop_4x_vec)
-
- VPTESTN %VMM(1), %VMM(1), %k1
- KMOV %k1, %VRAX
- test %VRAX, %VRAX
- jnz L(first_vec_x0)
-
- KMOV %k0, %VRAX
- test %VRAX, %VRAX
- jnz L(first_vec_x1)
-
- VPTESTN %VMM(3), %VMM(3), %k0
-
-# if CHAR_PER_VEC == 64
- KMOV %k0, %VRAX
- test %VRAX, %VRAX
- jnz L(first_vec_x2)
- KMOV %k2, %VRAX
-# else
- /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
- */
- kmovd %k2, %edx
- kmovd %k0, %eax
- salq $CHAR_PER_VEC, %rdx
- orq %rdx, %rax
-# endif
-
- /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
- */
- .p2align 4,, 2
-L(FALLTHROUGH_RETURN_LBL):
- bsfq %rax, %rax
- subq %rcx, %rdi
- CHAR_SIZE_SHIFT_REG (rdi)
- leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
- ret
-
- .p2align 4,, 8
-L(first_vec_x0):
- bsf %VRAX, %VRAX
- sub %rcx, %rdi
- CHAR_SIZE_SHIFT_REG (rdi)
- addq %rdi, %rax
- ret
-
- .p2align 4,, 10
-L(first_vec_x1):
- bsf %VRAX, %VRAX
- sub %rcx, %rdi
- CHAR_SIZE_SHIFT_REG (rdi)
- leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
- ret
-
- .p2align 4,, 10
- /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
- */
-L(TAIL_RETURN_LBL):
- bsf %VRAX, %VRAX
- sub %VRCX, %VRDI
- CHAR_SIZE_SHIFT_REG (VRDI)
- lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
- ret
-
- .p2align 4,, 8
-L(cross_page_boundary):
- movq %rdi, %rcx
- /* Align data to VEC_SIZE. */
- andq $-VEC_SIZE, %rdi
-
- VPCMPEQ (%rdi), %VZERO, %k0
-
- KMOV %k0, %VRAX
-# ifdef USE_AS_WCSLEN
- movl %ecx, %edx
- shrl $2, %edx
- andl $(CHAR_PER_VEC - 1), %edx
- shrx %edx, %eax, %eax
- testl %eax, %eax
-# else
- shr %cl, %VRAX
-# endif
- jz L(cross_page_continue)
- bsf %VRAX, %VRAX
- ret
-
-END (STRLEN)
+#ifndef STRLEN
+# define STRLEN __strlen_evex
#endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
index 0b7f220214..ebf22c259f 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,4 +1,264 @@
-#define STRLEN __strnlen_evex512
-#define USE_AS_STRNLEN 1
+/* Placeholder function, not used by any processor at the moment.
+ Copyright (C) 2022-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
-#include "strlen-evex512.S"
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex512
+#endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+# define VPCMPEQ vpcmpeqd
+# define VPTESTN vptestnmd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+# else
+# define VPCMPEQ vpcmpeqb
+# define VPTESTN vptestnmb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+# endif
+
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+ .section SECTION(.text),"ax",@progbits
+/* Aligning entry point to 64 byte, provides better performance for
+ one vector length string. */
+ENTRY_P2ALIGN (STRNLEN, 6)
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(ret_max)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+
+ movl %edi, %eax
+ vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(page_cross)
+
+ /* Compare [w]char for null, mask bit will be set for match. */
+ VPCMPEQ (%rdi), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ /* Store max length in rax. */
+ mov %rsi, %rax
+ /* If rcx is 0, rax will have max length. We can not use VRCX
+ and VRAX here for evex256 because, upper 32 bits may be
+ undefined for ecx and eax. */
+ bsfq %rcx, %rax
+ cmp $CHAR_PER_VEC, %rax
+ ja L(align_more)
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+ ret
+
+ /* At this point vector max length reached. */
+ .p2align 4,,3
+L(ret_max):
+ movq %rsi, %rax
+ ret
+
+L(align_more):
+ mov %rdi, %rax
+ /* Align rax to VEC_SIZE. */
+ andq $-VEC_SIZE, %rax
+ movq %rdi, %rdx
+ subq %rax, %rdx
+# ifdef USE_AS_WCSLEN
+ shr $2, %VRDX
+# endif
+ /* At this point rdx contains [w]chars already compared. */
+ leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
+ /* At this point rdx contains number of w[char] needs to go.
+ Now onwards rdx will keep decrementing with each compare. */
+
+ /* Loop unroll 4 times for 4 vector loop. */
+ VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
+ subq $-VEC_SIZE, %rax
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+
+ VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x3)
+
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x4)
+
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+ /* Save pointer before 4 x VEC_SIZE alignment. */
+ movq %rax, %rcx
+
+ /* Align address to VEC_SIZE * 4 for loop. */
+ andq $-(VEC_SIZE * 4), %rax
+
+ subq %rax, %rcx
+# ifdef USE_AS_WCSLEN
+ shr $2, %VRCX
+# endif
+ /* rcx contains number of [w]char will be recompared due to
+ alignment fixes. rdx must be incremented by rcx to offset
+ alignment adjustment. */
+ addq %rcx, %rdx
+ /* Need jump as we don't want to add/subtract rdx for first
+ iteration of 4 x VEC_SIZE aligned loop. */
+
+ .p2align 4,,11
+L(loop):
+ /* VPMINU and VPCMP combination provide better performance as
+ compared to alternative combinations. */
+ VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
+ VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+ VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
+ VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ VPTESTN %VMM(4), %VMM(4), %k1
+
+ subq $-(VEC_SIZE * 4), %rax
+ KORTEST %k0, %k1
+
+ jnz L(loopend)
+ subq $(CHAR_PER_VEC * 4), %rdx
+ ja L(loop)
+ mov %rsi, %rax
+ ret
+
+L(loopend):
+
+ VPTESTN %VMM(1), %VMM(1), %k2
+ KMOV %k2, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+
+ KMOV %k0, %VRCX
+ /* At this point, if k0 is non zero, null char must be in the
+ second vector. */
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+
+ VPTESTN %VMM(3), %VMM(3), %k3
+ KMOV %k3, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x3)
+ /* At this point null [w]char must be in the fourth vector so no
+ need to check. */
+ KMOV %k1, %VRCX
+
+ /* Fourth, third, second vector terminating are pretty much
+ same, implemented this way to avoid branching and reuse code
+ from pre loop exit condition. */
+L(ret_vec_x4):
+ bsf %VRCX, %VRCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 3), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+ ret
+
+L(ret_vec_x3):
+ bsf %VRCX, %VRCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 2), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+ ret
+
+L(ret_vec_x2):
+ subq $-VEC_SIZE, %rax
+L(ret_vec_x1):
+ bsf %VRCX, %VRCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ addq %rcx, %rax
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+ ret
+
+L(page_cross):
+ mov %rdi, %rax
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ sarl $2, %ecx
+# endif
+ /* ecx contains number of w[char] to be skipped as a result
+ of address alignment. */
+ andq $-VEC_SIZE, %rax
+ VPCMPEQ (%rax), %VMM(0), %k0
+ KMOV %k0, %VRDX
+ /* Ignore number of character for alignment adjustment. */
+ shr %cl, %VRDX
+ jnz L(page_cross_end)
+ movl $CHAR_PER_VEC, %eax
+ sub %ecx, %eax
+ cmp %rax, %rsi
+ ja L(align_more)
+
+L(page_cross_end):
+ bsf %VRDX, %VRAX
+ cmpq %rsi, %rax
+ cmovnb %esi, %eax
+ ret
+
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
index f59c372b78..aff288a66b 100644
--- a/sysdeps/x86_64/multiarch/wcslen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -1,4 +1,8 @@
-#define STRLEN __wcslen_evex512
+#ifndef WCSLEN
+# define WCSLEN __wcslen_evex512
+#endif
+
+#define STRLEN WCSLEN
#define USE_AS_WCSLEN 1
#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
index 73dcf2f210..1c37d74fc9 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -1,5 +1,8 @@
-#define STRLEN __wcsnlen_evex512
+#ifndef WCSNLEN
+# define WCSNLEN __wcsnlen_evex512
+#endif
+
+#define STRNLEN WCSNLEN
#define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
-#include "strlen-evex512.S"
+#include "strnlen-evex512.S"