aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-11-08 17:38:38 -0800
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-11-08 19:22:33 -0800
commitf049f52dfeed8129c11ab1641a815705d09ff7e8 (patch)
treea6c13dc462411b308467b26a3a0f1062e0597bbd /sysdeps
parentd44e116428fefa0c2d01151af11f7a41fb525536 (diff)
downloadglibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar
glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.gz
glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.bz2
glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.zip
x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
Optimizations are: 1. Use more overlapping stores to avoid branches. 2. Reduce how unrolled the aligning copies are (this is more of a code-size save, it's a negative for some sizes in terms of perf). 3. Improve the loop a bit (similar to what we do in strlen with 2x vpminu + kortest instead of 3x vpminu + kmov + test). 4. For st{r|p}n{cat|cpy} re-order the branches to minimize the number that are taken. Performance Changes: Times are from N = 10 runs of the benchmark suite and are reported as geometric mean of all ratios of New Implementation / Old Implementation. stpcpy-evex -> 0.922 strcat-evex -> 0.985 strcpy-evex -> 0.880 strncpy-evex -> 0.831 stpncpy-evex -> 0.780 strncat-evex -> 0.958 Code Size Changes: function -> Bytes New / Bytes Old -> Ratio strcat-evex -> 819 / 1874 -> 0.437 strcpy-evex -> 700 / 1074 -> 0.652 stpcpy-evex -> 735 / 1094 -> 0.672 strncpy-evex -> 1397 / 2611 -> 0.535 stpncpy-evex -> 1489 / 2691 -> 0.553 strncat-evex -> 1184 / 2832 -> 0.418 Notes: 1. Because of the significant difference between the implementations they are split into three files. strcpy-evex.S -> strcpy, stpcpy, strcat strncpy-evex.S -> strncpy strncat-evex.S -> strncat I couldn't find a way to merge them without making the ifdefs incredibly difficult to follow. 2. All implementations can be made evex512 by including "x86-evex512-vecs.h" at the top. 3. All implementations have an optional define: `USE_EVEX_MASKED_STORE` Setting to one uses evex-masked stores for handling short strings. This saves code size and branches. It's disabled for all implementations at the moment as there are some serious drawbacks to masked stores in certain cases, but that may be fixed on future architectures. Full check passes on x86-64 and build succeeds for all ISA levels w/ and w/o multiarch.
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-evex.S5
-rw-r--r--sysdeps/x86_64/multiarch/strcat-evex.S291
-rw-r--r--sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S110
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-evex.S1282
-rw-r--r--sysdeps/x86_64/multiarch/strncat-evex.S525
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-evex.S995
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h80
7 files changed, 2115 insertions, 1173 deletions
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a372..3693491baa 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
#endif
#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df947e9..b4207b7889 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
- Copyright (C) 2021-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_evex
-# endif
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE 32
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
- xor %eax, %eax
- mov %edi, %ecx
- and $((VEC_SIZE * 4) - 1), %ecx
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
- cmp $(VEC_SIZE * 3), %ecx
- ja L(fourth_vector_boundary)
- vpcmpb $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_first_vector)
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- jmp L(align_vec_size_start)
-L(fourth_vector_boundary):
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- vpcmpb $0, (%rax), %YMMZERO, %k0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- kmovd %k0, %edx
- and %r10d, %edx
- jnz L(exit)
-
-L(align_vec_size_start):
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 4), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- kmovd %k4, %edx
- add $(VEC_SIZE * 4), %rax
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 4), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 5), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- add $VEC_SIZE, %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- add $VEC_SIZE, %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1
- add $VEC_SIZE, %rax
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $VEC_SIZE, %rax
-
- .p2align 4
-L(align_four_vec_loop):
- VMOVA (%rax), %YMM0
- VMOVA (VEC_SIZE * 2)(%rax), %YMM1
- vpminub VEC_SIZE(%rax), %YMM0, %YMM0
- vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
- vpminub %YMM0, %YMM1, %YMM0
- /* If K0 != 0, there is a null byte. */
- vpcmpb $0, %YMM0, %YMMZERO, %k0
- add $(VEC_SIZE * 4), %rax
- ktestd %k0, %k0
- jz L(align_four_vec_loop)
-
- vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
- sub $(VEC_SIZE * 5), %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_null_on_first_vector):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_second_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $VEC_SIZE, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_third_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 2), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fourth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 3), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fifth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
-
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT __strcat_evex
#endif
+
+#define USE_AS_STRCAT
+#define STRCPY STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
new file mode 100644
index 0000000000..9530d7b683
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
@@ -0,0 +1,110 @@
+/* strlen used for beginning of str{n}cat using EVEX 256/512.
+ Copyright (C) 2011-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* NOTE: This file is meant to be included by strcat-evex or
+ strncat-evex and does not stand alone. Before including %rdi
+ must be saved in %rax. */
+
+
+/* Simple strlen implementation that ends at
+ L(strcat_strlen_done). */
+ vpxorq %VZERO_128, %VZERO_128, %VZERO_128
+ movq %rdi, %r8
+ andq $(VEC_SIZE * -1), %r8
+ VPCMPEQ (%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+#ifdef USE_AS_WCSCPY
+ subl %r8d, %edi
+ shrl $2, %edi
+#endif
+ shrx %VRDI, %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+ movq %rax, %rdi
+#endif
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+
+ VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ leaq (VEC_SIZE)(%r8), %rdi
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v1)
+
+ VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v2)
+
+ VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v3)
+
+ andq $-(VEC_SIZE * 4), %rdi
+ .p2align 4,, 8
+L(loop_2x_vec):
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0)
+ VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2)
+ VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
+ VPTESTN %VMM(1), %VMM(1), %k1
+ VPTESTN %VMM(3), %VMM(3), %k3
+ subq $(VEC_SIZE * -4), %rdi
+ KORTEST %k1, %k3
+ jz L(loop_2x_vec)
+
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+ KMOV %k1, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v1)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v2)
+
+ KMOV %k3, %VRCX
+L(bsf_and_done_v3):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+ bsf %VRCX, %VRCX
+ leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
+ jmp L(strcat_strlen_done)
+
+ .p2align 4,, 4
+L(bsf_and_done_v1):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+ bsf %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+#else
+ addq %rcx, %rdi
+#endif
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac675..932129ab40 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
Copyright (C) 2021-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,990 +17,526 @@
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
-
#if ISA_SHOULD_BUILD (4)
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
+ /* Use evex-masked stores for small sizes. Turned off at the
+ moment. */
+# define USE_EVEX_MASKED_STORE 0
+ /* Use movsb in page cross case to save code size. */
+# define USE_MOVSB_IN_PAGE_CROSS 1
-# ifndef STRCPY
-# define STRCPY __strcpy_evex
-# endif
+# include <sysdep.h>
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-# define VEC_SIZE 32
+# ifndef STRCPY
+# define STRCPY __strcpy_evex
# endif
-# define XMM2 xmm18
-# define XMM3 xmm19
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-# define YMM7 ymm23
+# ifdef USE_AS_WCSCPY
+# define VMOVU_MASK vmovdqu32
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
-# ifndef USE_AS_STRCAT
+# define REP_MOVS rep movsd
-/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM1 ymm17
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
- test %R8_LP, %R8_LP
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
+# define USE_WIDE_CHAR
+# else
+# define VMOVU_MASK vmovdqu8
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+# define REP_MOVS rep movsb
# endif
- and $((VEC_SIZE * 4) - 1), %ecx
- cmp $(VEC_SIZE * 2), %ecx
- jbe L(SourceStringAlignmentLessTwoVecSize)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
-
- vpcmpb $0, (%rsi), %YMMZERO, %k0
- kmovd %k0, %edx
- shr %cl, %rdx
+# include "reg-macros.h"
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $VEC_SIZE, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $(VEC_SIZE + 1), %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyVecSizeTailCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail)
-
- vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1
- kmovd %k1, %edx
-# ifdef USE_AS_STRNCPY
- add $VEC_SIZE, %r10
- cmp %r10, %r8
- jbe L(CopyTwoVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize)
-
- VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */
- VMOVU %YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(UnalignVecSizeBoth):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
- sbb %rcx, %rcx
- or %rcx, %r8
-# endif
- mov $VEC_SIZE, %rcx
- VMOVA (%rsi, %rcx), %YMM2
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 3), %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+# define END_REG rax
# else
- jnz L(CopyVecSize)
+# define END_REG rdi, %rdx, CHAR_SIZE
# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+# define PAGE_ALIGN_REG edx
+# define PAGE_ALIGN_REG_64 rdx
# else
- jnz L(CopyVecSize)
+# define PAGE_ALIGN_REG eax
+# define PAGE_ALIGN_REG_64 rax
# endif
- VMOVU %YMM3, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM4
- vpcmpb $0, %YMM4, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
-# else
- jnz L(CopyVecSize)
-# endif
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
- VMOVU %YMM4, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- VMOVU %YMM2, (%rdi, %rcx)
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
-# else
- jnz L(CopyVecSize)
-# endif
- VMOVU %YMM3, (%rdi, %rcx)
- mov %rsi, %rdx
- lea VEC_SIZE(%rsi, %rcx), %rsi
- and $-(VEC_SIZE * 4), %rsi
- sub %rsi, %rdx
- sub %rdx, %rdi
-# ifdef USE_AS_STRNCPY
- lea (VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
- VMOVA (%rsi), %YMM4
- VMOVA VEC_SIZE(%rsi), %YMM5
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM5, %YMM4, %YMM2
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
- /* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
- kmovd %k7, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8
- jbe L(UnalignedLeaveCase2OrCase3)
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+ movq %rdi, %rax
+# include "strcat-strlen-evex.h.S"
# endif
- test %edx, %edx
- jnz L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
- add $(VEC_SIZE * 4), %rdi
- add $(VEC_SIZE * 4), %rsi
- VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi)
- VMOVA (%rsi), %YMM4
- VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi)
- VMOVA VEC_SIZE(%rsi), %YMM5
- vpminub %YMM5, %YMM4, %YMM2
- VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi)
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVU %YMM7, -VEC_SIZE(%rdi)
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
- /* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
- kmovd %k7, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8
- jbe L(UnalignedLeaveCase2OrCase3)
+
+ movl %esi, %PAGE_ALIGN_REG
+ andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+ cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+ ja L(page_cross)
+L(page_cross_continue):
+ VMOVU (%rsi), %VMM(0)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ movq %rdi, %rax
# endif
- test %edx, %edx
- jz L(UnalignedFourVecSizeLoop_start)
-L(UnalignedFourVecSizeLeave):
- vpcmpb $0, %YMM4, %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(CopyVecSizeUnaligned_0)
- vpcmpb $0, %YMM5, %YMMZERO, %k2
- kmovd %k2, %ecx
- test %ecx, %ecx
- jnz L(CopyVecSizeUnaligned_16)
+ /* Two short string implementations. One with traditional
+ branching approach and one with masked instructions (which
+ have potential for dramatically bad perf if dst splits a
+ page and is not in the TLB). */
+# if USE_EVEX_MASKED_STORE
+ VPTEST %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+# ifdef USE_AS_WCSCPY
+ subl $((1 << CHAR_PER_VEC)- 1), %VRCX
+# else
+ inc %VRCX
+# endif
+ jz L(more_1x_vec)
+ KMOV %VRCX, %k1
+ KXOR %k0, %k1, %k1
- vpcmpb $0, %YMM6, %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(CopyVecSizeUnaligned_32)
-
- vpcmpb $0, %YMM7, %YMMZERO, %k4
- kmovd %k4, %ecx
- bsf %ecx, %edx
- VMOVU %YMM4, (%rdi)
- VMOVU %YMM5, VEC_SIZE(%rdi)
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
- VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
- add $(VEC_SIZE - 1), %r8
- sub %rdx, %r8
- lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $(VEC_SIZE * 3), %rsi
- add $(VEC_SIZE * 3), %rdi
- jmp L(CopyVecSizeExit)
-# endif
+ VMOVU_MASK %VMM(0), (%rdi){%k1}
-/* If source address alignment == destination address alignment */
+# ifdef USE_AS_STPCPY
+ bsf %VRCX, %VRCX
+ leaq (%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+ ret
-L(SourceStringAlignmentLessTwoVecSize):
- VMOVU (%rsi), %YMM3
- VMOVU VEC_SIZE(%rsi), %YMM2
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
+# else
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jz L(more_1x_vec)
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $VEC_SIZE, %r8
+ xorl %edx, %edx
+ bsf %VRCX, %VRDX
+# ifdef USE_AS_STPCPY
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# endif
+
+ /* Use mask bits in rcx to detect which copy we need. If the low
+ mask is zero then there must be a bit set in the upper half.
+ I.e if rcx != 0 and ecx == 0, then match must be upper 32
+ bits so we use L(copy_32_63). */
+# if VEC_SIZE == 64
+# ifdef USE_AS_WCSCPY
+ testb %cl, %cl
+# else
+ testl %ecx, %ecx
+# endif
+ jz L(copy_32_63)
+# endif
+
+# ifdef USE_AS_WCSCPY
+ testb $0xf, %cl
# else
- cmp $(VEC_SIZE + 1), %r8
+ testw %cx, %cx
# endif
- jbe L(CopyVecSizeTail1Case2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail1)
+ jz L(copy_16_31)
- VMOVU %YMM3, (%rdi)
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $(VEC_SIZE * 2), %r8
+# ifdef USE_AS_WCSCPY
+ testb $0x3, %cl
# else
- cmp $((VEC_SIZE * 2) + 1), %r8
+ testb %cl, %cl
# endif
- jbe L(CopyTwoVecSize1Case2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize1)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
- jmp L(UnalignVecSizeBoth)
+ jz L(copy_8_15)
-/*------End of main part with loops---------------------*/
-/* Case1 */
+# ifdef USE_AS_WCSCPY
+ vmovd %VMM_128(0), (%rdi)
+ /* No need to copy, we know its zero. */
+ movl $0, (%END_REG)
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
- .p2align 4
-L(CopyVecSize):
- add %rcx, %rdi
-# endif
-L(CopyVecSizeTail):
- add %rcx, %rsi
-L(CopyVecSizeTail1):
- bsf %edx, %edx
-L(CopyVecSizeExit):
- cmp $32, %edx
- jae L(Exit32_63)
- cmp $16, %edx
- jae L(Exit16_31)
- cmp $8, %edx
- jae L(Exit8_15)
- cmp $4, %edx
- jae L(Exit4_7)
- cmp $3, %edx
- je L(Exit3)
- cmp $1, %edx
- ja L(Exit2)
- je L(Exit1)
- movb $0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea (%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $1, %r8
- lea 1(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
ret
+# else
- .p2align 4
-L(CopyTwoVecSize1):
- add $VEC_SIZE, %rsi
- add $VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $VEC_SIZE, %r8
-# endif
- jmp L(CopyVecSizeTail1)
-
- .p2align 4
-L(CopyTwoVecSize):
- bsf %edx, %edx
- add %rcx, %rsi
- add $VEC_SIZE, %edx
- sub %ecx, %edx
- jmp L(CopyVecSizeExit)
-
- .p2align 4
-L(CopyVecSizeUnaligned_0):
- bsf %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- VMOVU %YMM4, (%rdi)
- add $((VEC_SIZE * 4) - 1), %r8
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- jmp L(CopyVecSizeExit)
-# endif
+ testb $0x7, %cl
+ jz L(copy_4_7)
- .p2align 4
-L(CopyVecSizeUnaligned_16):
- bsf %ecx, %edx
- VMOVU %YMM4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea VEC_SIZE(%rdi, %rdx), %rax
-# endif
- VMOVU %YMM5, VEC_SIZE(%rdi)
- add $((VEC_SIZE * 3) - 1), %r8
- sub %rdx, %r8
- lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $VEC_SIZE, %rsi
- add $VEC_SIZE, %rdi
- jmp L(CopyVecSizeExit)
-# endif
- .p2align 4
-L(CopyVecSizeUnaligned_32):
- bsf %edx, %edx
- VMOVU %YMM4, (%rdi)
- VMOVU %YMM5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
- add $((VEC_SIZE * 2) - 1), %r8
- sub %rdx, %r8
- lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $(VEC_SIZE * 2), %rsi
- add $(VEC_SIZE * 2), %rdi
- jmp L(CopyVecSizeExit)
-# endif
+ test %edx, %edx
+ jz L(set_null_term)
-# ifdef USE_AS_STRNCPY
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(CopyVecSizeUnalignedVec6):
- VMOVU %YMM6, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec5):
- VMOVU %YMM5, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec4):
- VMOVU %YMM4, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec3):
- VMOVU %YMM3, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
+ /* NB: make this `vmovw` if support for AVX512-FP16 is added.
+ */
+ vmovd %VMM_128(0), %esi
+ movw %si, (%rdi)
+
+ .p2align 4,, 1
+L(set_null_term):
+ /* No need to copy, we know it's zero. */
+ movb $0, (%END_REG)
+ ret
# endif
-/* Case2 */
-
- .p2align 4
-L(CopyVecSizeCase2):
- add $VEC_SIZE, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSizeCase2):
- add %rcx, %rsi
- bsf %edx, %edx
- add $VEC_SIZE, %edx
- sub %ecx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
- add %rcx, %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-/* Case2 or Case3, Case3 */
-
- .p2align 4
-L(CopyVecSizeCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
- add $VEC_SIZE, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyTwoVecSizeCase2)
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyVecSizeTailCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeTailCase2)
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
- add $VEC_SIZE, %rdi
- add $VEC_SIZE, %rsi
- sub $VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeTail1Case2)
- jmp L(StrncpyExit)
+# if VEC_SIZE == 64
+ .p2align 4,, 6
+L(copy_32_63):
+ VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+ VMOVU %VMM_256(0), (%rdi)
+ VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
+ ret
+# endif
+
+
+ .p2align 4,, 6
+L(copy_16_31):
+ /* Use xmm1 explicitly here as it won't require a `vzeroupper`
+ and will save code size. */
+ vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+ VMOVU %VMM_128(0), (%rdi)
+ vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
+ ret
+
+ .p2align 4,, 8
+L(copy_8_15):
+# ifdef USE_AS_WCSCPY
+ movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+# else
+ movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
+# endif
+ vmovq %VMM_128(0), (%rdi)
+ movq %rcx, -(8 - CHAR_SIZE)(%END_REG)
+ ret
# endif
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
- .p2align 4
-L(Exit1):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
+# ifndef USE_AS_WCSCPY
+ .p2align 4,, 12
+L(copy_4_7):
+ movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
+ vmovd %VMM_128(0), (%rdi)
+ movl %ecx, -(4 - CHAR_SIZE)(%END_REG)
+ ret
# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $2, %r8
- lea 2(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
+
+
+ .p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ VMOVU %VMM(0), (%rdi)
# endif
- ret
+ subq %rsi, %rdi
+ andq $-(VEC_SIZE), %rsi
+ addq %rsi, %rdi
+ VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
- .p2align 4
-L(Exit2):
- movzwl (%rsi), %ecx
- mov %cx, (%rdi)
- movb $0, 2(%rdi)
+ /* Ideally we store after moves to minimize impact of potential
+ false-dependencies. */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ VMOVU %VMM(0), (%rax)
+# endif
+
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
+ VMOVU %VMM(1), VEC_SIZE(%rdi)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+
+ VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
+ VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
+
+ VPTESTN %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(ret_vec_x3)
+
+ VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x4)
+
+ VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
+
+
+ /* Align for 4x loop. */
+ subq %rsi, %rdi
+
+ /* + VEC_SIZE * 5 because we never added the original VEC_SIZE
+ we covered before aligning. */
+ subq $-(VEC_SIZE * 5), %rsi
+ andq $-(VEC_SIZE * 4), %rsi
+
+
+ /* Load first half of the loop before entry. */
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+ KORTEST %k2, %k4
+ jnz L(loop_4x_done)
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+ VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+
+ subq $(VEC_SIZE * -4), %rsi
+
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+ KORTEST %k2, %k4
+ jz L(loop_4x_vec)
+
+L(loop_4x_done):
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ /* Restore rdi (dst). */
+ addq %rsi, %rdi
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x0_end)
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+ KMOV %k2, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+ /* Place L(ret_vec_x3) here to save code size. We get a
+ meaningful benefit doing this for stpcpy. */
+ KMOV %k4, %VRDX
+L(ret_vec_x3):
+ bsf %VRDX, %VRDX
+ VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $3, %r8
- lea 3(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
# endif
+L(return_end):
ret
- .p2align 4
-L(Exit3):
- mov (%rsi), %edx
- mov %edx, (%rdi)
+ .p2align 4,, 6
+L(ret_vec_x0_end):
+ bsf %VRCX, %VRCX
# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $4, %r8
- lea 4(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
+ inc %VRCX
+ VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
ret
- .p2align 4
-L(Exit4_7):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov -3(%rsi, %rdx), %ecx
- mov %ecx, -3(%rdi, %rdx)
+ .p2align 4,, 8
+L(ret_vec_x1):
+ bsf %VRCX, %VRCX
+ VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
- .p2align 4
-L(Exit8_15):
- mov (%rsi), %rcx
- mov -7(%rsi, %rdx), %r9
- mov %rcx, (%rdi)
- mov %r9, -7(%rdi, %rdx)
+ .p2align 4,, 4
+L(ret_vec_x2):
+ bsf %VRCX, %VRCX
+ VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
- .p2align 4
-L(Exit16_31):
- VMOVU (%rsi), %XMM2
- VMOVU -15(%rsi, %rdx), %XMM3
- VMOVU %XMM2, (%rdi)
- VMOVU %XMM3, -15(%rdi, %rdx)
+ /* ret_vec_x3 reuses return code after the loop. */
+ .p2align 4,, 6
+L(ret_vec_x4):
+ bsf %VRCX, %VRCX
+ VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
- .p2align 4
-L(Exit32_63):
- VMOVU (%rsi), %YMM2
- VMOVU -31(%rsi, %rdx), %YMM3
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, -31(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
+
+ .p2align 4,, 4
+L(page_cross):
+# ifndef USE_AS_STRCAT
+ vpxorq %VZERO_128, %VZERO_128, %VZERO_128
# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ movq %rsi, %rcx
+ andq $(VEC_SIZE * -1), %rcx
+
+ VPCMPEQ (%rcx), %VZERO, %k0
+ KMOV %k0, %VRCX
+# ifdef USE_AS_WCSCPY
+ andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG
+ shrl $2, %PAGE_ALIGN_REG
# endif
- ret
+ shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX
-# ifdef USE_AS_STRNCPY
+# if USE_MOVSB_IN_PAGE_CROSS
+ /* Optimizing more aggressively for space as this is very cold
+ code. This saves 2x cache lines. */
- .p2align 4
-L(StrncpyExit1):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 1(%rdi)
+ /* This adds one to the later bsf result, which gives the correct
+ copy bounds (null-term included). NB: this can never zero-out a
+ non-zero RCX as to be in the page cross case rsi cannot be aligned
+ and we already right-shift rcx by the misalignment. */
+ shl %VRCX
+ jz L(page_cross_continue)
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ movq %rdi, %rax
# endif
- ret
+ bsf %VRCX, %VRCX
+ REP_MOVS
- .p2align 4
-L(StrncpyExit2):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 2(%rdi)
+ leaq -CHAR_SIZE(%rdi), %rax
# endif
ret
- .p2align 4
-L(StrncpyExit3_4):
- movzwl (%rsi), %ecx
- movzwl -2(%rsi, %r8), %edx
- mov %cx, (%rdi)
- mov %dx, -2(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- ret
- .p2align 4
-L(StrncpyExit5_8):
- mov (%rsi), %ecx
- mov -4(%rsi, %r8), %edx
- mov %ecx, (%rdi)
- mov %edx, -4(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- ret
+# else
+ /* Check if we found zero-char before end of page. */
+ test %VRCX, %VRCX
+ jz L(page_cross_continue)
- .p2align 4
-L(StrncpyExit9_16):
- mov (%rsi), %rcx
- mov -8(%rsi, %r8), %rdx
- mov %rcx, (%rdi)
- mov %rdx, -8(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- ret
+ /* Traditional copy case, essentially same as used in non-page-
+ cross case but since we can't reuse VMM(0) we need twice as
+ many loads from rsi. */
- .p2align 4
-L(StrncpyExit17_32):
- VMOVU (%rsi), %XMM2
- VMOVU -16(%rsi, %r8), %XMM3
- VMOVU %XMM2, (%rdi)
- VMOVU %XMM3, -16(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
+# ifndef USE_AS_STRCAT
+ xorl %edx, %edx
# endif
- ret
-
- .p2align 4
-L(StrncpyExit33_64):
- /* 0/32, 31/16 */
- VMOVU (%rsi), %YMM2
- VMOVU -VEC_SIZE(%rsi, %r8), %YMM3
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, -VEC_SIZE(%rdi, %r8)
+ /* Dependency on rdi must already have been satisfied. */
+ bsf %VRCX, %VRDX
# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# elif !defined USE_AS_STRCAT
+ movq %rdi, %rax
# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- ret
- .p2align 4
-L(StrncpyExit65):
- /* 0/32, 32/32, 64/1 */
- VMOVU (%rsi), %YMM2
- VMOVU 32(%rsi), %YMM3
- mov 64(%rsi), %cl
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, 32(%rdi)
- mov %cl, 64(%rdi)
-# ifdef USE_AS_STPCPY
- lea 65(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 65(%rdi)
+# if VEC_SIZE == 64
+# ifdef USE_AS_WCSCPY
+ testb %cl, %cl
+# else
+ test %ecx, %ecx
+# endif
+ jz L(page_cross_copy_32_63)
# endif
- ret
-
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(Fill1):
- mov %dl, (%rdi)
- ret
+# ifdef USE_AS_WCSCPY
+ testb $0xf, %cl
+# else
+ testw %cx, %cx
+# endif
+ jz L(page_cross_copy_16_31)
- .p2align 4
-L(Fill2):
- mov %dx, (%rdi)
- ret
+# ifdef USE_AS_WCSCPY
+ testb $0x3, %cl
+# else
+ testb %cl, %cl
+# endif
+ jz L(page_cross_copy_8_15)
- .p2align 4
-L(Fill3_4):
- mov %dx, (%rdi)
- mov %dx, -2(%rdi, %r8)
+# ifdef USE_AS_WCSCPY
+ movl (%rsi), %esi
+ movl %esi, (%rdi)
+ movl $0, (%END_REG)
ret
+# else
- .p2align 4
-L(Fill5_8):
- mov %edx, (%rdi)
- mov %edx, -4(%rdi, %r8)
- ret
+ testb $0x7, %cl
+ jz L(page_cross_copy_4_7)
- .p2align 4
-L(Fill9_16):
- mov %rdx, (%rdi)
- mov %rdx, -8(%rdi, %r8)
+ test %edx, %edx
+ jz L(page_cross_set_null_term)
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+L(page_cross_set_null_term):
+ movb $0, (%END_REG)
ret
- .p2align 4
-L(Fill17_32):
- VMOVU %XMMZERO, (%rdi)
- VMOVU %XMMZERO, -16(%rdi, %r8)
- ret
- .p2align 4
-L(CopyVecSizeUnalignedVec2):
- VMOVU %YMM2, (%rdi, %rcx)
-
- .p2align 4
-L(CopyVecSizeVecExit):
- bsf %edx, %edx
- add $(VEC_SIZE - 1), %r8
- add %rcx, %rdi
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
-
- .p2align 4
-L(StrncpyFillTailWithZero):
- xor %edx, %edx
- sub $VEC_SIZE, %r8
- jbe L(StrncpyFillExit)
-
- VMOVU %YMMZERO, (%rdi)
- add $VEC_SIZE, %rdi
-
- mov %rdi, %rsi
- and $(VEC_SIZE - 1), %esi
- sub %rsi, %rdi
- add %rsi, %r8
- sub $(VEC_SIZE * 4), %r8
- jb L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
- VMOVA %YMMZERO, (%rdi)
- VMOVA %YMMZERO, VEC_SIZE(%rdi)
- VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi)
- VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi)
- add $(VEC_SIZE * 4), %rdi
- sub $(VEC_SIZE * 4), %r8
- jae L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
- add $(VEC_SIZE * 2), %r8
- jl L(StrncpyFillLessTwoVecSize)
- VMOVA %YMMZERO, (%rdi)
- VMOVA %YMMZERO, VEC_SIZE(%rdi)
- add $(VEC_SIZE * 2), %rdi
- sub $VEC_SIZE, %r8
- jl L(StrncpyFillExit)
- VMOVA %YMMZERO, (%rdi)
- add $VEC_SIZE, %rdi
- jmp L(Fill)
-
- .p2align 4
-L(StrncpyFillLessTwoVecSize):
- add $VEC_SIZE, %r8
- jl L(StrncpyFillExit)
- VMOVA %YMMZERO, (%rdi)
- add $VEC_SIZE, %rdi
- jmp L(Fill)
-
- .p2align 4
-L(StrncpyFillExit):
- add $VEC_SIZE, %r8
-L(Fill):
- cmp $17, %r8d
- jae L(Fill17_32)
- cmp $9, %r8d
- jae L(Fill9_16)
- cmp $5, %r8d
- jae L(Fill5_8)
- cmp $3, %r8d
- jae L(Fill3_4)
- cmp $1, %r8d
- ja L(Fill2)
- je L(Fill1)
+ .p2align 4,, 4
+L(page_cross_copy_4_7):
+ movl (%rsi), %ecx
+ movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
+ movl %ecx, (%rdi)
+ movl %esi, -(4 - CHAR_SIZE)(%END_REG)
ret
-
-/* end of ifndef USE_AS_STRCAT */
# endif
- .p2align 4
-L(UnalignedLeaveCase2OrCase3):
- test %rdx, %rdx
- jnz L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
- lea (VEC_SIZE * 4)(%r8), %rcx
- and $-VEC_SIZE, %rcx
- add $(VEC_SIZE * 3), %r8
- jl L(CopyVecSizeCase3)
- VMOVU %YMM4, (%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- VMOVU %YMM5, VEC_SIZE(%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
-# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 4)(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (VEC_SIZE * 4)(%rdi)
-# endif
+# if VEC_SIZE == 64
+ .p2align 4,, 4
+L(page_cross_copy_32_63):
+ VMOVU (%rsi), %VMM_256(0)
+ VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+ VMOVU %VMM_256(0), (%rdi)
+ VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
ret
-
- .p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
- xor %ecx, %ecx
- vpcmpb $0, %YMM4, %YMMZERO, %k1
- kmovd %k1, %edx
- add $(VEC_SIZE * 3), %r8
- jle L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
-# else
- jnz L(CopyVecSize)
-# endif
- vpcmpb $0, %YMM5, %YMMZERO, %k2
- kmovd %k2, %edx
- VMOVU %YMM4, (%rdi)
- add $VEC_SIZE, %rcx
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec5)
-# else
- jnz L(CopyVecSize)
# endif
- vpcmpb $0, %YMM6, %YMMZERO, %k3
- kmovd %k3, %edx
- VMOVU %YMM5, VEC_SIZE(%rdi)
- add $VEC_SIZE, %rcx
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec6)
-# else
- jnz L(CopyVecSize)
-# endif
-
- vpcmpb $0, %YMM7, %YMMZERO, %k4
- kmovd %k4, %edx
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
- lea VEC_SIZE(%rdi, %rcx), %rdi
- lea VEC_SIZE(%rsi, %rcx), %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
-L(StrncpyExit):
- cmp $65, %r8d
- je L(StrncpyExit65)
- cmp $33, %r8d
- jae L(StrncpyExit33_64)
- cmp $17, %r8d
- jae L(StrncpyExit17_32)
- cmp $9, %r8d
- jae L(StrncpyExit9_16)
- cmp $5, %r8d
- jae L(StrncpyExit5_8)
- cmp $3, %r8d
- jae L(StrncpyExit3_4)
- cmp $1, %r8d
- ja L(StrncpyExit2)
- je L(StrncpyExit1)
-# ifdef USE_AS_STPCPY
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi)
-# endif
+ .p2align 4,, 4
+L(page_cross_copy_16_31):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
ret
- .p2align 4
-L(ExitZero):
-# ifndef USE_AS_STRCAT
- mov %rdi, %rax
-# endif
+ .p2align 4,, 4
+L(page_cross_copy_8_15):
+ movq (%rsi), %rcx
+ movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+ movq %rcx, (%rdi)
+ movq %rsi, -(8 - CHAR_SIZE)(%END_REG)
ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
# endif
+END(STRCPY)
#endif
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
index 203a19bf21..bced4e8944 100644
--- a/sysdeps/x86_64/multiarch/strncat-evex.S
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -1,7 +1,520 @@
-#ifndef STRNCAT
-# define STRNCAT __strncat_evex
-#endif
+/* {wcs|str}ncat with 256/512-bit EVEX.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+ /* Use evex-masked stores for small sizes. Turned off at the
+ moment. */
+# define USE_EVEX_MASKED_STORE 0
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
+
+# ifndef STRNCAT
+# define STRNCAT __strncat_evex
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+# define MOVCHAR movl
+# define VMOVU_MASK vmovdqu32
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
+
+# define REP_MOVS rep movsd
+
+# define VMASK_REG VR10
+# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst
+
+# define USE_WIDE_CHAR
+# else
+# define MOVCHAR movb
+# define VMOVU_MASK vmovdqu8
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
+
+# define REP_MOVS rep movsb
+
+# define VMASK_REG VRCX
+# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst
+
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# include "reg-macros.h"
+
+
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
+
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+ movq %rdi, %rax
+
+ /* NB: It's safe to filter out zero-length strings WITHOUT
+ setting null-term. Destination MUST be a null-terminated
+ string so essentially the work is already done. */
+# ifdef USE_AS_WCSCPY
+ leaq -1(%rdx), %rcx
+ shrq $56, %rcx
+ jnz L(zero_len)
+# else
+ test %rdx, %rdx
+ jle L(zero_len)
+# endif
+
+# include "strcat-strlen-evex.h.S"
+
+ movl %esi, %ecx
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(page_cross)
+L(page_cross_continue):
+ VMOVU (%rsi), %VMM(0)
+ VPTESTN %VMM(0), %VMM(0), %k0
+
+ /* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+ <= CHAR_PER_VEC with masked instructions (which have
+ potential for dramatically bad perf if dst splits a page and
+ is not in the TLB). */
+# if USE_EVEX_MASKED_STORE
+ KMOV %k0, %VRCX
+ FIND_FIRST_ONE (VRCX, VR8)
+ cmpq %r8, %rdx
+ jbe L(less_1x_vec)
+
+ test %VRCX, %VRCX
+ jz L(more_1x_vec)
+
+ blsmsk %VRCX, %VRCX
+ KMOV %VRCX, %k1
+ VMOVU_MASK %VMM(0), (%rdi){%k1}
+ ret
+
+L(less_1x_vec):
+ mov $-1, %VRCX
+ bzhi %VRDX, %VRCX, %VRCX
+ KMOV %VRCX, %k1
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ VMOVU_MASK %VMM(0), (%rdi){%k1}
+
+ ret
+# else
+ KMOV %k0, %VMASK_REG
+ /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
+ %VMASK_REG, %VRCX` for wcsncat. */
+ FIND_FIRST_ONE (VMASK_REG, VRCX)
+ cmpq %rcx, %rdx
+ jbe L(less_1x_vec)
+
+ /* If there were no zero-CHARs (rcx was zero before
+ FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
+ cmpl $CHAR_PER_VEC, %ecx
+ je L(more_1x_vec)
+
+ movl %ecx, %edx
+
+L(less_1x_vec):
+# if VEC_SIZE == 64
+ cmpl $(32 / CHAR_SIZE), %edx
+ jae L(copy_32_63)
+# endif
+
+ cmpl $(16 / CHAR_SIZE), %edx
+ jae L(copy_16_31)
+
+
+ cmpl $(8 / CHAR_SIZE), %edx
+ jae L(copy_8_15)
+
+# ifdef USE_AS_WCSCPY
+ vmovd %VMM_128(0), (%rdi)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+# else
+
+ cmpl $4, %edx
+ jae L(copy_4_7)
+
+ movzbl (%rsi), %ecx
+ cmpl $1, %edx
+ jbe L(set_null_term)
+
+ movzwl 1(%rsi), %esi
+ movw %si, 1(%rdi)
+
+ .p2align 4,, 1
+L(set_null_term):
+ movb %cl, (%rdi)
+ MOVCHAR $0, (%rdi, %rdx)
+ ret
+# endif
+
+# if VEC_SIZE == 64
+ .p2align 4,, 6
+L(copy_32_63):
+ VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+ VMOVU %VMM_256(0), (%rdi)
+ VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+# endif
+ .p2align 4,, 6
+L(copy_16_31):
+ /* Use xmm1 explicitly here as it won't require a `vzeroupper`
+ and will save code size. */
+ vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
+ VMOVU %VMM_128(0), (%rdi)
+ vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+
+ .p2align 4,, 2
+L(copy_8_15):
+ movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx
+ vmovq %VMM_128(0), (%rdi)
+ movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+
+# ifndef USE_AS_WCSCPY
+ .p2align 4,, 12
+L(copy_4_7):
+ movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx
+ vmovd %VMM_128(0), (%rdi)
+ movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
+ MOVCHAR $0, (%rdi, %rdx, CHAR_SIZE)
+ ret
+# endif
+
+# endif
+ .p2align 4,, 4
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+ test %rdx, %rdx
+# endif
+ jne OVERFLOW_STRCAT
+ ret
-#define USE_AS_STRNCAT
-#define STRCAT STRNCAT
-#include "strcat-evex.S"
+ .p2align 4,, 8
+L(more_1x_vec):
+ VMOVU %VMM(0), (%rdi)
+
+ /* We are going to align rsi here so will need to be able to
+ re-adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+ so rsi + rdx * CHAR_SIZE cannot overflow. */
+
+ leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+ subq %rsi, %rdi
+ andq $-(VEC_SIZE), %rsi
+L(loop_last_4x_vec):
+ addq %rsi, %rdi
+ subq %rsi, %rdx
+# ifdef USE_AS_WCSCPY
+ shrq $2, %rdx
+# endif
+
+ /* Will need this regardless. */
+ VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VMASK_REG
+
+ cmpq $(CHAR_PER_VEC * 2), %rdx
+ ja L(more_2x_vec)
+
+L(last_2x_vec):
+ FIND_FIRST_ONE (VMASK_REG, VRCX)
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x1_len)
+
+ /* If there were no zero-CHARs (rcx was zero before
+ FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
+ cmpl $CHAR_PER_VEC, %ecx
+ jne L(ret_vec_x1)
+
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ addl $-CHAR_PER_VEC, %edx
+ bzhi %VRDX, %VRCX, %VR8
+ jz L(ret_vec_x2_len)
+L(ret_vec_x2):
+ bsf %VRCX, %VRDX
+L(ret_vec_x2_len):
+ VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+ VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+ ret
+
+ .p2align 4,, 4
+L(ret_vec_x1_len):
+ movl %edx, %ecx
+L(ret_vec_x1):
+ VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
+ VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+ VZEROUPPER_RETURN
+
+
+ .p2align 4,, 8
+L(last_4x_vec):
+ addl $-(CHAR_PER_VEC * 4), %edx
+ VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VMASK_REG
+ subq $-(VEC_SIZE * 4), %rsi
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpl $(CHAR_PER_VEC * 2), %edx
+ jbe L(last_2x_vec)
+ .p2align 4,, 8
+L(more_2x_vec):
+# ifdef USE_AS_WCSCPY
+ xorl %ecx, %ecx
+# endif
+ bsf %VMASK_REG, %VRCX
+ jnz L(ret_vec_x1)
+
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+
+ VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
+ VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
+ VPTESTN %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VMASK_REG
+
+ cmpq $(CHAR_PER_VEC * 4), %rdx
+ ja L(more_4x_vec)
+
+ /* Adjust length before going to L(ret_vec_x3_len) or
+ L(ret_vec_x3). */
+ addl $(CHAR_PER_VEC * -2), %edx
+
+ FIND_FIRST_ONE (VMASK_REG, VRCX)
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x3_len)
+
+ /* If there were no zero-CHARs (rcx was zero before
+ FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */
+ cmpl $CHAR_PER_VEC, %ecx
+ jne L(ret_vec_x3)
+
+ VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ addl $-CHAR_PER_VEC, %edx
+ bzhi %VRDX, %VRCX, %VR8
+ jz L(ret_vec_x4_len)
+L(ret_vec_x4):
+ bsf %VRCX, %VRDX
+L(ret_vec_x4_len):
+ VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
+ VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)
+ ret
+
+ .p2align 4,, 4
+L(ret_vec_x3_len):
+ movl %edx, %ecx
+L(ret_vec_x3):
+ VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+ VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
+ ret
+
+ .p2align 4,, 8
+L(more_4x_vec):
+# ifdef USE_AS_WCSCPY
+ xorl %ecx, %ecx
+# endif
+ bsf %VMASK_REG, %VRCX
+ jnz L(ret_vec_x3)
+
+ VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x4)
+
+ VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
+
+ /* Check if we are near the end before aligning. */
+ cmpq $(CHAR_PER_VEC * 8), %rdx
+ jbe L(last_4x_vec)
+
+
+ /* Add rsi to rdx (length) before aligning rsi. NB: Since we
+ filtered out huge lengths this cannot overflow. */
+# ifdef USE_AS_WCSCPY
+ leaq (%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+ addq %rsi, %rdx
+# endif
+
+ /* Subtract rsi from rdi before aligning (add back will have
+ correct rdi for aligned rsi). */
+ subq %rsi, %rdi
+ subq $-(VEC_SIZE * 5), %rsi
+ andq $(VEC_SIZE * -4), %rsi
+
+ /* Load first half of the loop before entry. */
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+
+ /* Offset rsi by VEC_SIZE so that we can jump to
+ L(loop_last_4x_vec). */
+ addq $-(VEC_SIZE), %rsi
+ KORTEST %k2, %k4
+ jnz L(loop_4x_done)
+
+ /* Store loop end in r9. */
+ leaq -(VEC_SIZE * 5)(%rdx), %r9
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+ VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+ VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+ VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+ subq $(VEC_SIZE * -4), %rsi
+ cmpq %rsi, %r9
+ jbe L(loop_last_4x_vec)
+
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+ KORTEST %k2, %k4
+ jz L(loop_4x_vec)
+
+L(loop_4x_done):
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ /* Restore rdi (dst). */
+ addq %rsi, %rdi
+
+ /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so
+ test with bsf. */
+ bsf %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+ VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi)
+
+ KMOV %k2, %VRCX
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x2)
+ VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ bsf %VRCX, %VRCX
+ jnz L(ret_vec_x3)
+ VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+
+ KMOV %k4, %VRCX
+ bsf %VRCX, %VRCX
+ VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
+ ret
+
+
+ .p2align 4,, 4
+L(page_cross):
+ movq %rsi, %r8
+ andq $(VEC_SIZE * -1), %r8
+ VPCMPEQ (%r8), %VZERO, %k0
+
+# ifdef USE_AS_WCSCPY
+ KMOV %k0, %VR9
+ shrl $2, %ecx
+ andl $(CHAR_PER_VEC - 1), %ecx
+ shrx %VRCX, %VR9, %VRCX
+# else
+ KMOV %k0, %VRCX
+ shrx %VRSI, %VRCX, %VRCX
+# endif
+
+ subl %esi, %r8d
+ andl $(VEC_SIZE - 1), %r8d
+# ifdef USE_AS_WCSCPY
+ shrl $2, %r8d
+# endif
+ cmpq %r8, %rdx
+ jbe L(page_cross_small)
+ /* Optimizing more for space as this is very cold code. This
+ saves 2x cache lines. */
+
+ /* This adds one to the later bsf result, which gives the correct
+ copy bounds (null-term included). NB: this can never zero-out a
+ non-zero RCX as to be in the page cross case rsi cannot be aligned
+ and we already right-shift rcx by the misalignment. */
+ shl %VRCX
+ jz L(page_cross_continue)
+ bsf %VRCX, %VRCX
+ REP_MOVS
+ ret
+
+L(page_cross_small):
+ tzcnt %VRCX, %VRCX
+ jz L(page_cross_setz)
+ cmpl %edx, %ecx
+ cmova %edx, %ecx
+
+# ifdef USE_AS_WCSCPY
+ rep movsd
+# else
+ rep movsb
+# endif
+L(page_cross_setz):
+ MOVCHAR $0, (%rdi)
+ ret
+END(STRNCAT)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
index 1b3426d511..49eaf4cbd9 100644
--- a/sysdeps/x86_64/multiarch/strncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -1,7 +1,990 @@
-#ifndef STRNCPY
-# define STRNCPY __strncpy_evex
-#endif
+/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+ /* Use evex-masked stores for small sizes. Turned off at the
+ moment. */
+# define USE_EVEX_MASKED_STORE 0
+
+
+# include <sysdep.h>
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNCPY
+# define STRNCPY __strncpy_evex
+# endif
+
+# ifdef USE_AS_WCSCPY
+# define VMOVU_MASK vmovdqu32
+# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define CHAR_SIZE 4
+
+# define REP_MOVS rep movsd
+# define REP_STOS rep stosl
+
+# define USE_WIDE_CHAR
+
+# else
+# define VMOVU_MASK vmovdqu8
+# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define CHAR_SIZE 1
+
+# define REP_MOVS rep movsb
+# define REP_STOS rep stosb
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+
+# define VZERO VMM(7)
+# define VZERO_256 VMM_256(7)
+# define VZERO_128 VMM_128(7)
+
+# if VEC_SIZE == 64
+# define VZERO_HALF VZERO_256
+# else
+# define VZERO_HALF VZERO_128
+# endif
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+ /* Filter out zero-length and absurdly long requests up front.
+ Zero length just returns dst. A huge length is handled by
+ running rep stos{b|l} to zero-fill (which will almost
+ certainly segfault) and, if that succeeds, jumping to
+ OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
+# ifdef USE_AS_WCSCPY
+ decq %rdx
+ movq %rdx, %rax
+ /* Any bit >= 56 set: len was 0 or is past max address space. */
+ shr $56, %rax
+ jnz L(zero_len)
+# else
+ decq %rdx
+ /* `dec` does not update CF: if the branch needs to become `jb`
+ replace `dec` with `sub`. */
+ jl L(zero_len)
+# endif
+
+ vpxorq %VZERO_128, %VZERO_128, %VZERO_128
+ movl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(page_cross) /* VEC_SIZE load at rsi would cross a page. */
+
+L(page_cross_continue):
+ VMOVU (%rsi), %VMM(0)
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+
+ /* If not STPCPY the return value is just dst; set it up front. */
+# ifndef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+
+
+ cmpq $(CHAR_PER_VEC), %rdx
+
+ /* If USE_EVEX_MASKED_STORE is enabled then we just handle
+ length <= CHAR_PER_VEC with masked instructions (which have
+ potential for dramatically bad perf if dst splits a page and
+ is not in the TLB). */
+# if USE_EVEX_MASKED_STORE
+ /* `jae` because length rdx is now length - 1. */
+ jae L(more_1x_vec)
+
+ /* If there were multiple zero-CHAR matches in the first VEC,
+ VRCX will be overset but that's fine since any oversets were
+ at zero-positions anyway. */
+
+# ifdef USE_AS_STPCPY
+ tzcnt %VRCX, %VRAX
+ cmpl %eax, %edx
+ cmovb %edx, %eax
+# ifdef USE_AS_WCSCPY
+ adcl $0, %eax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ adcq %rdi, %rax
+# endif
+# endif
+ dec %VRCX
+
+ /* Zero out all non-zero CHARs after the first zero match. */
+ KMOV %VRCX, %k1
+
+ /* Use VZERO as destination so this can be reused for
+ L(zfill_less_vec) (which, if jumped to by subsequent logic,
+ will have an all-zero VZERO). */
+ VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
+L(zfill_less_vec):
+ /* Get mask for what we need to set. */
+ incl %edx
+ mov $-1, %VRCX
+ bzhi %VRDX, %VRCX, %VRCX
+ KMOV %VRCX, %k1
+ VMOVU_MASK %VZERO, (%rdi){%k1}
+ ret
+
+ .p2align 4,, 4
+L(zero_len):
+ cmpq $-1, %rdx
+ jne L(best_effort_strncpy)
+ movq %rdi, %rax
+ ret
+
+ .p2align 4,, 8
+L(more_1x_vec):
+# else
+ /* `jb` because length rdx is now length - 1. */
+ jb L(less_1x_vec)
+# endif
+
+
+ /* This may overset but that's fine because we still need to
+ zero fill. */
+ VMOVU %VMM(0), (%rdi)
+
+
+ /* Length must be >= CHAR_PER_VEC so a match here means we must
+ zero-fill. */
+ test %VRCX, %VRCX
+ jnz L(zfill)
+
+
+ /* We are going to align rsi here so will need to be able to
+ re-adjust rdi/rdx afterwards. NB: We filtered out huge
+ lengths so rsi + rdx * CHAR_SIZE cannot overflow. */
+ leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+ subq %rsi, %rdi
+ andq $-(VEC_SIZE), %rsi
+
+L(loop_last_4x_vec):
+ addq %rsi, %rdi /* Restore rdi (dst). */
+ subq %rsi, %rdx /* Restore rdx (length). */
+# ifdef USE_AS_WCSCPY
+ shrq $2, %rdx
+# endif
+
+ VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+
+ /* -1 because of the `dec %rdx` earlier. */
+ cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
+ ja L(more_2x_vec)
+
+L(last_2x_vec):
+ /* This will need to be computed no matter what. We do it
+ ahead of time for CHAR_PER_VEC == 64 because we can't adjust
+ the value of `tzcnt` with a shift. */
+# if CHAR_PER_VEC == 64
+ tzcntq %rcx, %rcx
+# endif
+
+ cmpl $(CHAR_PER_VEC), %edx
+ jb L(ret_vec_x1_len)
+
+ /* Separate logic for CHAR_PER_VEC == 64 because we already did
+ `tzcnt` on VRCX. */
+# if CHAR_PER_VEC == 64
+ /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */
+ cmpb $CHAR_PER_VEC, %cl
+ jnz L(ret_vec_x1_no_bsf)
+# else
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x1)
+# endif
+
+
+
+ VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ KMOV %k0, %VRCX
+
+# if CHAR_PER_VEC < 64
+ /* This essentially adds CHAR_PER_VEC to the later `tzcnt`. */
+ shlq $CHAR_PER_VEC, %rcx
+# else
+ tzcntq %rcx, %rcx
+ addl $CHAR_PER_VEC, %ecx
+# endif
+
+ .p2align 4,, 4
+L(ret_vec_x1_len):
+ /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
+ already been done. */
+# if CHAR_PER_VEC < 64
+ tzcntq %rcx, %rcx
+# endif
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x1_len_no_zfill)
+ /* Fall through (the expected case) is copy len < buffer len. */
+ VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x1_len_no_zfill_mov):
+ movl %ecx, %edx
+# ifdef USE_AS_STPCPY
+ /* Clear CF so the `adc` below adds nothing. */
+ xorl %ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+ VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ leal (VEC_SIZE)(%rdx), %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+
+
+ .p2align 4,, 10
+L(ret_vec_x1):
+ bsf %VRCX, %VRCX
+L(ret_vec_x1_no_bsf):
+ VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+ subl %ecx, %edx
+ cmpl $CHAR_PER_VEC, %edx
+ jb L(ret_vec_x1_len_no_zfill_mov)
+ /* Fall through (expectation) is copy len < buffer len. */
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+ leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+ ret
+
+ .p2align 4,, 8
+L(last_4x_vec):
+ /* Separate logic for CHAR_PER_VEC == 64 because there `andl
+ $(CHAR_PER_VEC * 4 - 1), %edx` is a mask with 0xff, which
+ `movzbl` does with less code size. */
+# if CHAR_PER_VEC == 64
+ movzbl %dl, %edx
+# else
+ andl $(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+ VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+ subq $-(VEC_SIZE * 4), %rsi
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpl $(CHAR_PER_VEC * 2 - 1), %edx
+ jbe L(last_2x_vec)
+ .p2align 4,, 8
+L(more_2x_vec):
+ VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
+ test %VRCX, %VRCX
+ /* Must fill at least 2x VEC. */
+ jnz L(zfill_vec1)
+
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
+ VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ /* Must fill at least 1x VEC. */
+ jnz L(zfill_vec2)
+
+ VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
+ VPTESTN %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VRCX
+
+ /* Check if len is more than 4x VEC. -1 as rdx is len - 1. */
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rdx
+ ja L(more_4x_vec)
+
+ subl $(CHAR_PER_VEC * 3), %edx
+ jb L(ret_vec_x3_len)
+
+ test %VRCX, %VRCX
+ jnz L(ret_vec_x3)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ KMOV %k0, %VRCX
+ tzcnt %VRCX, %VRCX
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x4_len_no_zfill)
+ /* Fall through (the expected case) is copy len < buffer len. */
+ VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+ movl %ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+ VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ leal (VEC_SIZE * 4 + 0)(%rdx), %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+
+
+L(ret_vec_x3_len):
+ addl $(CHAR_PER_VEC * 1), %edx
+ tzcnt %VRCX, %VRCX
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x3_len_no_zfill)
+ /* Fall through (the expected case) is copy len < buffer len. */
+ VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x3_len_no_zfill_mov):
+ movl %ecx, %edx
+# ifdef USE_AS_STPCPY
+ /* Clear CF so the `adc` below adds nothing. */
+ xorl %ecx, %ecx
+# endif
+ .p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+ VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+ VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ leal (VEC_SIZE * 3 + 0)(%rdx), %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+
+
+ .p2align 4,, 8
+L(ret_vec_x3):
+ bsf %VRCX, %VRCX
+ VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
+ subl %ecx, %edx
+ jl L(ret_vec_x3_len_no_zfill_mov)
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+ leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+ ret
+
+ .p2align 4,, 8
+L(more_4x_vec):
+ VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
+ test %VRCX, %VRCX
+ jnz L(zfill_vec3)
+
+ VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
+ VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(zfill_vec4)
-#define USE_AS_STRNCPY
-#define STRCPY STRNCPY
-#include "strcpy-evex.S"
+ /* Recheck length before aligning. */
+ cmpq $(CHAR_PER_VEC * 8 - 1), %rdx
+ jbe L(last_4x_vec)
+
+ /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */
+# ifdef USE_AS_WCSCPY
+ leaq (%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+ addq %rsi, %rdx
+# endif
+ subq %rsi, %rdi
+ subq $-(VEC_SIZE * 5), %rsi
+ andq $(VEC_SIZE * -4), %rsi
+
+
+ /* Do the loads and zero-CHAR tests of the loop once before entry. */
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+
+
+ /* Offset rsi by VEC_SIZE so that we can jump to
+ L(loop_last_4x_vec). */
+ addq $-(VEC_SIZE), %rsi
+ KORTEST %k2, %k4
+ jnz L(loop_4x_done)
+
+ /* Store loop end in r9. */
+ leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+ VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+ VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+ VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+ subq $(VEC_SIZE * -4), %rsi
+ cmpq %rsi, %r9
+ jbe L(loop_last_4x_vec)
+
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPTESTN %VMM(4), %VMM(4), %k2
+ VPTESTN %VMM(6), %VMM(6), %k4
+ KORTEST %k2, %k4
+ jz L(loop_4x_vec)
+
+L(loop_4x_done):
+ /* Restore rdx (length). */
+ subq %rsi, %rdx
+# ifdef USE_AS_WCSCPY
+ shrq $2, %rdx
+# endif
+ VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+ /* Restore rdi (dst). */
+ addq %rsi, %rdi
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(zfill_vec1)
+
+ VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+ KMOV %k2, %VRCX /* k2 = zero-CHARs of min (VMM(0), VMM(1)). */
+ test %VRCX, %VRCX
+ jnz L(zfill_vec2)
+
+ VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(zfill_vec3)
+
+ VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
+ KMOV %k4, %VRCX /* k4 = zero-CHARs of min (VMM(2), VMM(3)). */
+ /* Zero-CHAR must be in VMM(3): fall through to L(zfill_vec4). */
+
+ .p2align 4,, 4
+L(zfill_vec4):
+ subq $(VEC_SIZE * -2), %rdi
+ addq $(CHAR_PER_VEC * -2), %rdx
+L(zfill_vec2):
+ subq $(VEC_SIZE * -2), %rdi
+ addq $(CHAR_PER_VEC * -1), %rdx
+L(zfill):
+ /* VRCX must be non-zero. */
+ bsf %VRCX, %VRCX
+
+ /* Adjust length / dst for zfill. */
+ subq %rcx, %rdx
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+ addq %rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+L(zfill_from_page_cross):
+
+ /* From here on out it's just zeroing CHARs [0, rdx] at rdi. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jb L(zfill_less_vec)
+
+L(zfill_more_1x_vec):
+ VMOVU %VZERO, (%rdi)
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
+ ja L(zfill_more_2x_vec)
+L(zfill_done0):
+ ret
+
+ /* Coming from vec1/vec2 we must be able to zfill at least 2x
+ VEC. */
+ .p2align 4,, 8
+L(zfill_vec3):
+ subq $(VEC_SIZE * -2), %rdi
+ addq $(CHAR_PER_VEC * -2), %rdx
+ .p2align 4,, 2
+L(zfill_vec1):
+ bsfq %rcx, %rcx
+ /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
+ */
+ leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+ subq %rcx, %rdx
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+
+
+ VMOVU %VZERO, (%rdi)
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ cmpq $(CHAR_PER_VEC * 2), %rdx
+ jb L(zfill_done0)
+L(zfill_more_2x_vec):
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+ VMOVU %VZERO, (VEC_SIZE)(%rdi)
+ subq $(CHAR_PER_VEC * 4 - 1), %rdx
+ jbe L(zfill_done)
+
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rdx, CHAR_SIZE), %rdx
+# else
+ addq %rdi, %rdx
+# endif
+
+ VMOVU %VZERO, (VEC_SIZE * 2)(%rdi)
+ VMOVU %VZERO, (VEC_SIZE * 3)(%rdi)
+
+
+ VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+ VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpq %rdi, %rdx
+ jbe L(zfill_done)
+
+ /* Align rdi and zfill loop. */
+ andq $-(VEC_SIZE), %rdi
+ .p2align 4,, 12
+L(zfill_loop_4x_vec):
+ VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpq %rdi, %rdx
+ ja L(zfill_loop_4x_vec)
+L(zfill_done):
+ ret
+
+
+ /* Less 1x VEC case if we are not using evex masked store. */
+# if !USE_EVEX_MASKED_STORE
+ .p2align 4,, 8
+L(copy_1x):
+ /* Special case for copy 1x. It can be handled quickly and many
+ buffer sizes have convenient alignment. */
+ VMOVU %VMM(0), (%rdi)
+ /* If no zeros then we are done. Test all of VRCX: the mask is 64-bit when CHAR_PER_VEC == 64. */
+ test %VRCX, %VRCX
+ jz L(ret_1x_1x)
+
+ /* Need to zfill. Note we know that length <= CHAR_PER_VEC so
+ we only handle the small case here. */
+ bsf %VRCX, %VRCX
+L(zfill_less_vec_no_bsf):
+ /* Adjust length / dst then just zfill less_vec. */
+ subq %rcx, %rdx
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+ addq %rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+
+L(zfill_less_vec):
+ cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx
+ jb L(zfill_less_half)
+
+ VMOVU %VZERO_HALF, (%rdi)
+ VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ ret
+# ifdef USE_AS_STPCPY
+L(ret_1x_1x):
+ leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
+ ret
+# endif
+
+
+# if VEC_SIZE == 64
+ .p2align 4,, 4
+L(copy_32_63):
+ /* Overfill to avoid branches. */
+ VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+ VMOVU %VMM_256(0), (%rdi)
+ VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+
+ /* We are taking advantage of the fact that to be here we must
+ be writing the null-term at (%rdi, %rcx), so we have a CHAR
+ of leeway for overwriting. */
+ cmpl %ecx, %edx
+ ja L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+# else
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+# endif
+
+ .p2align 4,, 4
+L(copy_16_31):
+ /* Overfill to avoid branches. */
+ vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+ VMOVU %VMM_128(0), (%rdi)
+ vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ cmpl %ecx, %edx
+
+ /* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+ we have a larger copy block for 32-63 so this just falls
+ through to zfill 16-31. If VEC_SIZE == 32 then we check for
+ full zfill of less than 1x VEC. */
+# if VEC_SIZE == 64
+ jbe L(ret_16_31)
+ subl %ecx, %edx
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+ addq %rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+L(zfill_less_half):
+L(zfill_less_32):
+ cmpl $(16 / CHAR_SIZE), %edx
+ jb L(zfill_less_16)
+ VMOVU %VZERO_128, (%rdi)
+ VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+ ret
+# endif
+L(ret_16_31):
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+# else
+ /* VEC_SIZE == 32 begins. */
+ ja L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+# else
+# ifdef USE_AS_WCSCPY
+ adcq $0, %rdx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ ret
+# endif
+
+
+ .p2align 4,, 4
+L(copy_8_15):
+ /* Overfill to avoid branches. */
+ movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+ vmovq %VMM_128(0), (%rdi)
+ movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+ cmpl %ecx, %edx
+ jbe L(ret_8_15)
+ subl %ecx, %edx
+# ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+ addq %rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+ .p2align 4,, 8
+# if VEC_SIZE == 32
+L(zfill_less_half):
+# endif
+L(zfill_less_16):
+ xorl %ecx, %ecx
+ cmpl $(8 / CHAR_SIZE), %edx
+ jb L(zfill_less_8)
+ movq %rcx, (%rdi)
+ movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+ ret
+
+ .p2align 4,, 8
+L(less_1x_vec):
+ je L(copy_1x)
+
+ /* We will need `tzcnt` result for all other copy sizes. */
+ tzcnt %VRCX, %VRCX
+# if VEC_SIZE == 64
+ cmpl $(32 / CHAR_SIZE), %edx
+ jae L(copy_32_63)
+# endif
+
+ cmpl $(16 / CHAR_SIZE), %edx
+ jae L(copy_16_31)
+
+ cmpl $(8 / CHAR_SIZE), %edx
+ jae L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+ testl %ecx, %ecx
+ jz L(zfill_less_8_set_ret)
+
+ movl (%rsi, %rdx, CHAR_SIZE), %esi
+ vmovd %VMM_128(0), (%rdi)
+ movl %esi, (%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+ cmpl %ecx, %edx
+L(ret_8_15):
+ adcq $0, %rdx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# endif
+ ret
+L(zfill_less_8_set_ret):
+ xorl %ecx, %ecx
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+L(zfill_less_8):
+ movl %ecx, (%rdi)
+ movl %ecx, (%rdi, %rdx, CHAR_SIZE)
+ ret
+# else
+ cmpl $3, %edx
+ jb L(copy_0_3)
+ /* Overfill to avoid branches. */
+ movl -3(%rsi, %rdx), %esi
+ vmovd %VMM_128(0), (%rdi)
+ movl %esi, -3(%rdi, %rdx)
+ cmpl %ecx, %edx
+ jbe L(ret_4_7)
+ subq %rcx, %rdx
+ addq %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+ xorl %ecx, %ecx
+ .p2align 4,, 8
+L(zfill_less_8):
+ cmpl $3, %edx
+ jb L(zfill_less_3)
+ movl %ecx, (%rdi)
+ movl %ecx, -3(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ ret
+# endif
+
+L(ret_4_7):
+# ifdef USE_AS_STPCPY
+L(ret_8_15):
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+ ret
+
+ .p2align 4,, 4
+L(zfill_less_3):
+ testl %edx, %edx
+ jz L(zfill_1)
+ movw %cx, (%rdi)
+L(zfill_1):
+ movb %cl, (%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_0_3):
+ vmovd %VMM_128(0), %r8d
+ testl %edx, %edx
+ jz L(copy_1)
+ movw %r8w, (%rdi)
+ cmpl %ecx, %edx
+ ja L(zfill_from_1)
+ movzbl (%rsi, %rdx), %r8d
+# ifdef USE_AS_STPCPY
+ movl %edx, %eax
+ adcq %rdi, %rax
+ movb %r8b, (%rdi, %rdx)
+ ret
+# endif
+
+L(copy_1):
+# ifdef USE_AS_STPCPY
+ movl %edx, %eax
+ cmpl %ecx, %edx
+ adcq %rdi, %rax
+# endif
+# ifdef USE_AS_WCSCPY
+ vmovd %VMM_128(0), (%rdi)
+# else
+ movb %r8b, (%rdi, %rdx)
+# endif
+ ret
+# endif
+
+
+# ifndef USE_AS_WCSCPY
+ .p2align 4,, 8
+L(zfill_from_1):
+# ifdef USE_AS_STPCPY
+ leaq (%rdi, %rcx), %rax
+# endif
+ movw $0, -1(%rdi, %rdx)
+ ret
+# endif
+
+ .p2align 4,, 4
+L(zero_len):
+ incq %rdx
+ jne L(best_effort_strncpy)
+ movq %rdi, %rax
+ ret
+# endif
+
+
+ .p2align 4,, 4
+ .p2align 6,, 8
+L(page_cross):
+ movq %rsi, %rax
+ andq $(VEC_SIZE * -1), %rax
+ VPCMPEQ (%rax), %VZERO, %k0
+ KMOV %k0, %VRCX
+# ifdef USE_AS_WCSCPY
+ movl %esi, %r8d
+ shrl $2, %r8d
+ andl $(CHAR_PER_VEC - 1), %r8d
+ shrx %VR8, %VRCX, %VRCX
+# else
+ shrx %VRSI, %VRCX, %VRCX
+# endif
+
+ /* Compute number of CHARs we checked. */
+ subl %esi, %eax
+ andl $(VEC_SIZE - 1), %eax
+# ifdef USE_AS_WCSCPY
+ shrl $2, %eax
+# endif
+
+ /* If rax > rdx then we are finishing the copy at the end of the
+ page. */
+ cmpq %rax, %rdx
+ jb L(page_cross_small)
+
+
+ /* If rcx is zero (no zero-CHAR found) then continue. */
+ test %VRCX, %VRCX
+ jz L(page_cross_continue)
+
+ /* We found zero-CHAR so need to copy then zfill (we know we
+ didn't cover all of length here). */
+ bsf %VRCX, %VRCX
+L(movsb_and_zfill):
+ incl %ecx
+ subq %rcx, %rdx
+# ifdef USE_AS_STPCPY
+ leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
+# else
+ movq %rdi, %rax
+# endif
+
+ REP_MOVS
+# ifdef USE_AS_WCSCPY
+ movl $0, (%rdi)
+# else
+ movb $0, (%rdi)
+# endif
+ jmp L(zfill_from_page_cross)
+
+L(page_cross_small):
+ tzcnt %VRCX, %VRCX
+ cmpl %ecx, %edx
+ jbe L(page_cross_copy_only)
+
+ /* Do a zfill of the tail before copying. */
+ movq %rdi, %r9
+ xorl %eax, %eax
+
+ movl %ecx, %r8d
+
+ subl %ecx, %edx
+ leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+ movl %edx, %ecx
+ REP_STOS
+ movq %r9, %rdi
+ movl %r8d, %edx
+L(page_cross_copy_only):
+ leal 1(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ adcl $0, %edx
+ leaq (%rdi, %rdx, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# else
+ movq %rdi, %rax
+# endif
+ REP_MOVS
+ ret
+
+
+L(best_effort_strncpy):
+ movq %rdx, %rcx
+ xorl %eax, %eax
+ movq %rdi, %r8
+ /* The length is huge (>= 2^63, or past 2^56 CHARs for wide
+ chars). We very much expect to segfault at rep stos. If
+ that doesn't happen then just strcpy to finish. */
+ REP_STOS
+ movq %r8, %rdi
+ jmp OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
new file mode 100644
index 0000000000..d4f4d6c82b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
@@ -0,0 +1,80 @@
+/* Helper for getting proper name of overflow fallback function for
+ {wc|st}{p|r|s}n{cat|cpy}
+
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
+#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
+
+#if defined USE_MULTIARCH && IS_IN(libc)
+# define UNDERSCORES __
+# ifdef USE_WITH_SSE2
+# define ISA_EXT _sse2
+# elif defined USE_WITH_AVX2
+# ifdef USE_WITH_RTM
+# define ISA_EXT _avx2_rtm
+# else
+# define ISA_EXT _avx2
+# endif
+
+# elif defined USE_WITH_EVEX256
+# define ISA_EXT _evex
+# elif defined USE_WITH_EVEX512
+# define ISA_EXT _evex512
+# endif
+#else /* !(USE_MULTIARCH && IS_IN (libc)) */
+# define UNDERSCORES
+# define ISA_EXT
+#endif /* USE_MULTIARCH && IS_IN (libc) */
+
+#ifdef USE_AS_WCSCPY
+# define STRCPY_PREFIX wc
+# define STRCAT_PREFIX wcs
+# ifdef USE_AS_STPCPY
+# define STRCPY_POSTFIX pcpy
+# else
+# define STRCPY_POSTFIX scpy
+# endif
+#else /* !USE_AS_WCSCPY */
+# define STRCPY_PREFIX st
+# define STRCAT_PREFIX str
+# ifdef USE_AS_STPCPY
+# define STRCPY_POSTFIX pcpy
+# else
+# define STRCPY_POSTFIX rcpy
+# endif
+#endif /* USE_AS_WCSCPY */
+#define STRCAT_POSTFIX cat
+
+#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
+ underscores##prefix##postfix##ext
+/* Extra level so the arguments are macro-expanded before pasting. */
+#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
+
+#ifndef OVERFLOW_STRCPY
+# define OVERFLOW_STRCPY \
+ OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
+#endif
+
+#ifndef OVERFLOW_STRCAT
+# define OVERFLOW_STRCAT \
+ OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
+#endif
+
+#endif /* _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ */