1 files changed, 989 insertions, 6 deletions
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
index 1b3426d511..49eaf4cbd9 100644
--- a/sysdeps/x86_64/multiarch/strncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -1,7 +1,990 @@
-#ifndef STRNCPY
-# define STRNCPY	__strncpy_evex
-#endif
+/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+	/* Use EVEX masked stores for lengths <= CHAR_PER_VEC.  Turned
+	   off at the moment.  */
+# define USE_EVEX_MASKED_STORE	0
+
+
+# include <sysdep.h>
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNCPY
+#  define STRNCPY	__strncpy_evex
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define VMOVU_MASK	vmovdqu32
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define CHAR_SIZE	4	/* Bytes per CHAR (wide variant).  */
+
+#  define REP_MOVS	rep movsd
+#  define REP_STOS	rep stosl
+
+#  define USE_WIDE_CHAR
+
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define CHAR_SIZE	1	/* Bytes per CHAR (byte variant).  */
+
+#  define REP_MOVS	rep movsb
+#  define REP_STOS	rep stosb
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)	/* CHARs in one vector.  */
+
+# include "reg-macros.h"
+
+
+# define VZERO	VMM(7)	/* Cleared with vpxorq at function entry.  */
+# define VZERO_256	VMM_256(7)
+# define VZERO_128	VMM_128(7)
+
+# if VEC_SIZE == 64
+#  define VZERO_HALF	VZERO_256
+# else
+#  define VZERO_HALF	VZERO_128
+# endif
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)	/* rdi = dst, rsi = src, rdx = length in CHARs.  */
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return, very long strings are handled by
+	   just running rep stos{b|l} to zero set (which will almost
+	   certainly segfault), if that succeeds then just calling
+	   OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
+# ifdef USE_AS_WCSCPY
+	decq	%rdx	/* rdx = length - 1.  */
+	movq	%rdx, %rax
+	/* 56 is end of max supported address space.  */
+	shr	$56, %rax
+	jnz	L(zero_len)
+# else
+	decq	%rdx	/* rdx = length - 1.  */
+	/* `dec` leaves CF untouched, so if this branch ever needs to
+	   become `jb`, replace the `dec` with `sub`.  */
+	jl	L(zero_len)
+# endif
+
+	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
+	movl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)	/* Unaligned VEC load at rsi would cross a page.  */
+
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX	/* VRCX = mask of zero-CHARs in first VEC.  */
+
+	/* If not STPCPY the return value is dst, so set it early.  */
+# ifndef USE_AS_STPCPY
+	movq	%rdi, %rax
+# endif
+
+
+	cmpq	$(CHAR_PER_VEC), %rdx
+
+	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
+	   <= CHAR_PER_VEC with masked instructions (which have
+	   potential for dramatically bad perf if dst splits a page and
+	   is not in the TLB).  */
+# if USE_EVEX_MASKED_STORE
+	/* `jae` because length rdx is now length - 1.  */
+	jae	L(more_1x_vec)
+
+	/* If there were multiple zero-CHAR matches in the first VEC,
+	   VRCX will be overset but that's fine since any oversets were
+	   at zero-positions anyways.  */
+
+#  ifdef USE_AS_STPCPY
+	tzcnt	%VRCX, %VRAX
+	cmpl	%eax, %edx
+	cmovb	%edx, %eax
+#   ifdef USE_AS_WCSCPY
+	adcl	$0, %eax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#   else
+	adcq	%rdi, %rax
+#   endif
+#  endif
+	dec	%VRCX
+
+	/* Zero out all non-zero CHAR's after the first zero match.  */
+	KMOV	%VRCX, %k1
+
+	/* Use VZERO as destination so this can be reused for
+	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
+	   will have zeroed out VZERO).  */
+	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
+L(zfill_less_vec):
+	/* Get mask for what we need to set.  */
+	incl	%edx
+	mov	$-1, %VRCX
+	bzhi	%VRDX, %VRCX, %VRCX
+	KMOV	%VRCX, %k1
+	VMOVU_MASK %VZERO, (%rdi){%k1}
+	ret
+
+	.p2align 4,, 4
+L(zero_len):
+	cmpq	$-1, %rdx	/* rdx == -1 iff the length was zero.  */
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+
+	.p2align 4,, 8
+L(more_1x_vec):
+# else
+	/* `jb` because length rdx is now length - 1.  */
+	jb	L(less_1x_vec)
+# endif
+
+
+	/* This may overset but that's fine because we still need to zero
+	   fill.  */
+	VMOVU	%VMM(0), (%rdi)
+
+
+	/* Length must be >= CHAR_PER_VEC so match here means we must
+	   zero-fill.  */
+	test	%VRCX, %VRCX
+	jnz	L(zfill)
+
+
+	/* We are going to align rsi here so will need to be able to re-
+	   adjust rdi/rdx afterwards. NB: We filtered out huge lengths
+	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
+	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
+	subq	%rsi, %rdi	/* rdi = dst - src.  */
+	andq	$-(VEC_SIZE), %rsi
+
+L(loop_last_4x_vec):
+	addq	%rsi, %rdi	/* rdi = (dst - src) + aligned rsi.  */
+	subq	%rsi, %rdx	/* rdx back to an offset from aligned rsi.  */
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx	/* Bytes -> CHARs.  */
+# endif
+
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+
+	/* -1 because of the `dec %rdx` earlier.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* This will need to be computed no matter what. We do it
+	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
+	   the value of `tzcnt` with a shift.  */
+# if CHAR_PER_VEC == 64
+	tzcntq	%rcx, %rcx
+# endif
+
+	cmpl	$(CHAR_PER_VEC), %edx
+	jb	L(ret_vec_x1_len)
+
+	/* Separate logic for CHAR_PER_VEC == 64 because we already did
+	   `tzcnt` on VRCX.  */
+# if CHAR_PER_VEC == 64
+	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
+	cmpb	$CHAR_PER_VEC, %cl
+	jnz	L(ret_vec_x1_no_bsf)
+# else
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+# endif
+
+
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	KMOV	%k0, %VRCX
+
+# if CHAR_PER_VEC < 64
+	/* This essentially adds CHAR_PER_VEC to computed result.  */
+	shlq	$CHAR_PER_VEC, %rcx
+# else
+	tzcntq	%rcx, %rcx
+	addl	$CHAR_PER_VEC, %ecx
+# endif
+
+	.p2align 4,, 4
+L(ret_vec_x1_len):
+	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
+	   already been done.  */
+# if CHAR_PER_VEC < 64
+	tzcntq	%rcx, %rcx
+# endif
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x1_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x1_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* Clear CF so the `adc` below adds nothing extra.  */
+	xorl	%ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+	VMOVU	((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 10
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX	/* rcx = index of first zero-CHAR in VEC1.  */
+L(ret_vec_x1_no_bsf):
+	VMOVU	%VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	cmpl	$CHAR_PER_VEC, %edx
+	jb	L(ret_vec_x1_len_no_zfill_mov)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(last_4x_vec):
+	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
+	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
+	   using `movzbl`.  */
+# if CHAR_PER_VEC == 64
+	movzbl	%dl, %edx
+# else
+	andl	$(CHAR_PER_VEC * 4 - 1), %edx
+# endif
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+	subq	$-(VEC_SIZE * 4), %rsi
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2 - 1), %edx
+	jbe	L(last_2x_vec)
+	.p2align 4,, 8
+L(more_2x_vec):
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+	test	%VRCX, %VRCX
+	/* Must fill at least 2x VEC.  */
+	jnz	L(zfill_vec1)
+
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
+	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	/* Must fill at least 1x VEC.  */
+	jnz	L(zfill_vec2)
+
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if len is more than 4x VEC. -1 because rdx is len - 1.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	ja	L(more_4x_vec)
+
+	subl	$(CHAR_PER_VEC * 3), %edx
+	jb	L(ret_vec_x3_len)
+
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rsi), %VZERO, %k0
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	KMOV	%k0, %VRCX
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x4_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+	movl	%ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 4 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+L(ret_vec_x3_len):
+	addl	$(CHAR_PER_VEC * 1), %edx	/* edx = last index relative to VEC3.  */
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(ret_vec_x3_len_no_zfill)
+	/* Fall through (expectation) is copy len < buffer len.  */
+	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+L(ret_vec_x3_len_no_zfill_mov):
+	movl	%ecx, %edx
+# ifdef USE_AS_STPCPY
+	/* Clear CF so the `adc` below adds nothing extra.  */
+	xorl	%ecx, %ecx
+# endif
+	.p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
+	VMOVU	%VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
+	adcq	%rdi, %rax
+#  endif
+# endif
+	ret
+
+
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX	/* rcx = index of first zero-CHAR in VEC3.  */
+	VMOVU	%VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)
+	subl	%ecx, %edx
+	jl	L(ret_vec_x3_len_no_zfill_mov)
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
+# ifdef USE_AS_STPCPY
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
+	test	%VRCX, %VRCX	/* VRCX = zero-CHAR mask of VMM(3).  */
+	jnz	L(zfill_vec3)
+
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
+	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec4)
 
-#define USE_AS_STRNCPY
-#define STRCPY	STRNCPY
-#include "strcpy-evex.S"
+	/* Recheck length before aligning.  */
+	cmpq	$(CHAR_PER_VEC * 8 - 1), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi.  */
+# ifdef USE_AS_WCSCPY
+	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rsi, %rdx
+# endif
+	subq	%rsi, %rdi	/* rdi = dst - src while rsi is re-aligned.  */
+	subq	$-(VEC_SIZE * 5), %rsi
+	andq	$(VEC_SIZE * -4), %rsi
+
+
+	/* Load first half of the loop before entry.  */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+
+
+	/* Offset rsi by VEC_SIZE so that we can jump to
+	   L(loop_last_4x_vec).  */
+	addq	$-(VEC_SIZE), %rsi
+	KORTEST	%k2, %k4
+	jnz	L(loop_4x_done)
+
+	/* Store loop end in r9.  */
+	leaq	-(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)
+
+	subq	$(VEC_SIZE * -4), %rsi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	VPTESTN	%VMM(6), %VMM(6), %k4
+	KORTEST	%k2, %k4
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	/* Restore rdx (length).  */
+	subq	%rsi, %rdx
+# ifdef USE_AS_WCSCPY
+	shrq	$2, %rdx
+# endif
+	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
+	/* Restore rdi (dst).  */
+	addq	%rsi, %rdi
+	VPTESTN	%VMM(0), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec1)
+
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)
+	KMOV	%k2, %VRCX	/* k2 = zeros in min (VMM(0), VMM(1)).  */
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec2)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(zfill_vec3)
+
+	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
+	KMOV	%k4, %VRCX
+	/* Fall through: the zero-CHAR must be in the 4th VEC.  */
+
+	.p2align 4,, 4
+L(zfill_vec4):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+L(zfill_vec2):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -1), %rdx
+L(zfill):
+	/* VRCX must be non-zero.  */
+	bsf	%VRCX, %VRCX
+
+	/* Adjust length / dst for zfill.  */
+	subq	%rcx, %rdx
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+# else
+	addq	%rcx, %rdi
+# endif
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax	/* Return pointer to the first zero-CHAR.  */
+# endif
+L(zfill_from_page_cross):
+
+	/* From here on out it's just a zero-fill of dst (rdi).  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jb	L(zfill_less_vec)
+
+L(zfill_more_1x_vec):
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rdx
+	ja	L(zfill_more_2x_vec)
+L(zfill_done0):
+	ret
+
+	/* Coming from vec1/vec2 we must be able to zfill at least 2x
+	   VEC.  */
+	.p2align 4,, 8
+L(zfill_vec3):
+	subq	$(VEC_SIZE * -2), %rdi
+	addq	$(CHAR_PER_VEC * -2), %rdx
+	.p2align 4,, 2
+L(zfill_vec1):
+	bsfq	%rcx, %rcx
+	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.
+	 */
+	leaq	VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	movq	%rdi, %rax	/* Return pointer to the first zero-CHAR.  */
+# endif
+
+
+	VMOVU	%VZERO, (%rdi)
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jb	L(zfill_done0)
+L(zfill_more_2x_vec):
+	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VZERO, (VEC_SIZE)(%rdi)
+	subq	$(CHAR_PER_VEC * 4 - 1), %rdx
+	jbe	L(zfill_done)
+
+# ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rdx
+# else
+	addq	%rdi, %rdx
+# endif
+
+	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VZERO, (VEC_SIZE * 3)(%rdi)
+
+
+	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	jbe	L(zfill_done)
+
+	/* Align rdi and zfill loop.  */
+	andq	$-(VEC_SIZE), %rdi
+	.p2align 4,, 12
+L(zfill_loop_4x_vec):
+	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
+	ja	L(zfill_loop_4x_vec)
+L(zfill_done):
+	ret
+
+
+	/* Less than 1x VEC case when not using EVEX masked stores.  */
+# if !USE_EVEX_MASKED_STORE
+	.p2align 4,, 8
+L(copy_1x):
+	/* Special case for copy 1x. It can be handled quickly and many
+	   buffer sizes have convenient alignment.  */
+	VMOVU	%VMM(0), (%rdi)
+	/* If no zeros then we are done.  */
+	testl	%ecx, %ecx	/* NOTE(review): low 32 mask bits only; confirm vs VRCX.  */
+	jz	L(ret_1x_1x)
+
+	/* Need to zfill, note we know that length <= CHAR_PER_VEC so we
+	   only handle the small case here.  */
+	bsf	%VRCX, %VRCX
+L(zfill_less_vec_no_bsf):
+	/* Adjust length / dst then just zfill less_vec.  */
+	subq	%rcx, %rdx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax	/* Return pointer to the first zero-CHAR.  */
+#  endif
+
+L(zfill_less_vec):
+	cmpl	$((VEC_SIZE / 2) / CHAR_SIZE), %edx
+	jb	L(zfill_less_half)
+
+	VMOVU	%VZERO_HALF, (%rdi)
+	VMOVU	%VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	ret
+#  ifdef USE_AS_STPCPY
+L(ret_1x_1x):
+	leaq	CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax
+	ret
+#  endif
+
+
+#  if VEC_SIZE == 64
+	.p2align 4,, 4
+L(copy_32_63):
+	/* Overfill to avoid branches.  */
+	VMOVU	-(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
+	VMOVU	%VMM_256(0), (%rdi)
+	VMOVU	%VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+
+	/* We are taking advantage of the fact that to be here we must
+	   be writing null-term as (%rdi, %rcx) we have a byte of
+	   leeway for overwriting.  */
+	cmpl	%ecx, %edx
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(copy_16_31):
+	/* Overfill to avoid branches.  */
+	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
+	VMOVU	%VMM_128(0), (%rdi)
+	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+
+	/* Separate logic depending on VEC_SIZE. If VEC_SIZE == 64 then
+	   we have a larger copy block for 32-63 so this just falls
+	   through to zfill 16-31. If VEC_SIZE == 32 then we check for
+	   full zfill of less 1x VEC.  */
+#  if VEC_SIZE == 64
+	jbe	L(ret_16_31)
+	subl	%ecx, %edx
+#   ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#   else
+	addq	%rcx, %rdi
+#   endif
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_half):
+L(zfill_less_32):
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jb	L(zfill_less_16)
+	VMOVU	%VZERO_128, (%rdi)
+	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+L(ret_16_31):
+#   ifdef USE_AS_STPCPY
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  else
+	/* VEC_SIZE == 32 begins.  */
+	ja	L(zfill_less_vec_no_bsf)
+#   ifndef USE_AS_STPCPY
+L(ret_1x_1x):
+#   else
+#    ifdef USE_AS_WCSCPY
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#    else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#    endif
+#   endif
+	ret
+#  endif
+
+
+	.p2align 4,, 4
+L(copy_8_15):
+	/* Overfill to avoid branches.  */
+	movq	-(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+	vmovq	%VMM_128(0), (%rdi)
+	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	cmpl	%ecx, %edx
+	jbe	L(ret_8_15)
+	subl	%ecx, %edx
+#  ifdef USE_AS_WCSCPY
+	leaq	(%rdi, %rcx, CHAR_SIZE), %rdi
+#  else
+	addq	%rcx, %rdi
+#  endif
+#  ifdef USE_AS_STPCPY
+	movq	%rdi, %rax	/* Return pointer to the first zero-CHAR.  */
+#  endif
+	.p2align 4,, 8
+#  if VEC_SIZE == 32
+L(zfill_less_half):
+#  endif
+L(zfill_less_16):
+	xorl	%ecx, %ecx	/* rcx = 0 is the zero source for the stores below.  */
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jb	L(zfill_less_8)
+	movq	%rcx, (%rdi)
+	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+#  ifndef USE_AS_STPCPY
+L(ret_8_15):
+#  endif
+	ret
+
+	.p2align 4,, 8
+L(less_1x_vec):
+	je	L(copy_1x)	/* NOTE(review): only entered via `jb`, so ZF looks always clear; confirm.  */
+
+	/* We will need `tzcnt` result for all other copy sizes.  */
+	tzcnt	%VRCX, %VRCX
+#  if VEC_SIZE == 64
+	cmpl	$(32 / CHAR_SIZE), %edx
+	jae	L(copy_32_63)
+#  endif
+
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(copy_16_31)
+
+	cmpl	$(8 / CHAR_SIZE), %edx
+	jae	L(copy_8_15)
+#  ifdef USE_AS_WCSCPY
+	testl	%ecx, %ecx	/* ecx == 0: first CHAR is the zero-CHAR.  */
+	jz	L(zfill_less_8_set_ret)
+
+	movl	(%rsi, %rdx, CHAR_SIZE), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, (%rdi, %rdx, CHAR_SIZE)
+#   ifdef USE_AS_STPCPY
+	cmpl	%ecx, %edx	/* Set CF for the `adc` below.  */
+L(ret_8_15):
+	adcq	$0, %rdx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#   endif
+	ret
+L(zfill_less_8_set_ret):
+	xorl	%ecx, %ecx
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax
+#   endif
+L(zfill_less_8):
+	movl	%ecx, (%rdi)
+	movl	%ecx, (%rdi, %rdx, CHAR_SIZE)
+	ret
+#  else
+	cmpl	$3, %edx
+	jb	L(copy_0_3)
+	/* Overfill to avoid branches.  */
+	movl	-3(%rsi, %rdx), %esi
+	vmovd	%VMM_128(0), (%rdi)
+	movl	%esi, -3(%rdi, %rdx)
+	cmpl	%ecx, %edx
+	jbe	L(ret_4_7)
+	subq	%rcx, %rdx
+	addq	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	movq	%rdi, %rax	/* Return pointer to the first zero-CHAR.  */
+#   endif
+	xorl	%ecx, %ecx
+	.p2align 4,, 8
+L(zfill_less_8):
+	cmpl	$3, %edx
+	jb	L(zfill_less_3)
+	movl	%ecx, (%rdi)
+	movl	%ecx, -3(%rdi, %rdx)
+#   ifdef USE_AS_STPCPY
+	ret
+#   endif
+
+L(ret_4_7):
+#   ifdef USE_AS_STPCPY
+L(ret_8_15):
+	movl	%edx, %eax
+	adcq	%rdi, %rax	/* Uses CF from the preceding `cmpl %ecx, %edx`.  */
+#   endif
+	ret
+
+	.p2align 4,, 4
+L(zfill_less_3):
+	testl	%edx, %edx
+	jz	L(zfill_1)
+	movw	%cx, (%rdi)
+L(zfill_1):
+	movb	%cl, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_3):
+	vmovd	%VMM_128(0), %r8d
+	testl	%edx, %edx
+	jz	L(copy_1)
+	movw	%r8w, (%rdi)
+	cmpl	%ecx, %edx
+	ja	L(zfill_from_1)
+	movzbl	(%rsi, %rdx), %r8d
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+	movb	%r8b, (%rdi, %rdx)
+	ret
+#   endif
+
+L(copy_1):
+#   ifdef USE_AS_STPCPY
+	movl	%edx, %eax
+	cmpl	%ecx, %edx
+	adcq	%rdi, %rax
+#   endif
+#   ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+#   else
+	movb	%r8b, (%rdi, %rdx)
+#   endif
+	ret
+#  endif
+
+
+#  ifndef USE_AS_WCSCPY
+	.p2align 4,, 8
+L(zfill_from_1):
+#   ifdef USE_AS_STPCPY
+	leaq	(%rdi, %rcx), %rax
+#   endif
+	movw	$0, -1(%rdi, %rdx)	/* Zero the last two bytes of dst.  */
+	ret
+#  endif
+
+	.p2align 4,, 4
+L(zero_len):
+	incq	%rdx	/* Undo the `dec`; ZF set iff the length was zero.  */
+	jne	L(best_effort_strncpy)
+	movq	%rdi, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 4
+	.p2align 6,, 8
+L(page_cross):
+	movq	%rsi, %rax
+	andq	$(VEC_SIZE * -1), %rax
+	VPCMPEQ	(%rax), %VZERO, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSCPY
+	movl	%esi, %r8d
+	shrl	$2, %r8d
+	andl	$(CHAR_PER_VEC - 1), %r8d
+	shrx	%VR8, %VRCX, %VRCX
+# else
+	shrx	%VRSI, %VRCX, %VRCX
+# endif
+
+	/* Compute the amount we checked (converted to CHARs below).  */
+	subl	%esi, %eax
+	andl	$(VEC_SIZE - 1), %eax
+# ifdef USE_AS_WCSCPY
+	shrl	$2, %eax
+# endif
+
+	/* If rax > rdx then we are finishing the copy at the end of the
+	   page.  */
+	cmpq	%rax, %rdx
+	jb	L(page_cross_small)
+
+
+	/* If rcx is zero (no zero-CHAR found) then continue.  */
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
+
+	/* We found zero-CHAR so need to copy then zfill (we know we
+	   didn't cover all of length here).  */
+	bsf	%VRCX, %VRCX
+L(movsb_and_zfill):
+	incl	%ecx	/* Copy count includes the zero-CHAR.  */
+	subq	%rcx, %rdx
+# ifdef USE_AS_STPCPY
+	leaq	-CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
+# else
+	movq	%rdi, %rax
+# endif
+
+	REP_MOVS
+# ifdef USE_AS_WCSCPY
+	movl	$0, (%rdi)
+# else
+	movb	$0, (%rdi)
+# endif
+	jmp	L(zfill_from_page_cross)
+
+L(page_cross_small):
+	tzcnt	%VRCX, %VRCX
+	cmpl	%ecx, %edx
+	jbe	L(page_cross_copy_only)
+
+	/* Do a zfill of the tail before copying.  */
+	movq	%rdi, %r9	/* Save dst; `rep stos` advances rdi.  */
+	xorl	%eax, %eax
+
+	movl	%ecx, %r8d	/* Save zero-CHAR index; rcx is the stos count.  */
+
+	subl	%ecx, %edx
+	leaq	CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi
+	movl	%edx, %ecx
+	REP_STOS
+	movq	%r9, %rdi
+	movl	%r8d, %edx
+L(page_cross_copy_only):
+	leal	1(%rdx), %ecx	/* `lea` keeps CF intact for the `adc` below.  */
+# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_WCSCPY
+	adcl	$0, %edx
+	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
+#  else
+	movl	%edx, %eax
+	adcq	%rdi, %rax
+#  endif
+# else
+	movq	%rdi, %rax
+# endif
+	REP_MOVS
+	ret
+
+
+L(best_effort_strncpy):
+	movq	%rdx, %rcx	/* rcx = `rep stos` count.  */
+	xorl	%eax, %eax
+	movq	%rdi, %r8	/* Save dst; `rep stos` advances rdi.  */
+	/* The length is huge (it failed the entry range check), so we
+	   fully expect to segfault at rep stos. If that doesn't happen
+	   then just strcpy to finish.  */
+	REP_STOS
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif