aboutsummaryrefslogtreecommitdiff
path: root/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
diff options
context:
space:
mode:
Diffstat (limited to 'REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S')
-rw-r--r--REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S3551
1 files changed, 3551 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..47aaeae671
--- /dev/null
+++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3551 @@
+/* strcpy with SSSE3
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_ssse3
+# endif
+
+ .section .text.ssse3,"ax",@progbits
+ENTRY (STRCPY)
+/* Entry path, assembled only when USE_AS_STRCAT is not defined: park the arguments in scratch registers, then probe the first 16 source bytes one at a time for a NUL. */
+ mov %rsi, %rcx /* rcx = 2nd argument; every probe below reads through it */
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8 /* r8 = 3rd argument, used as the byte limit */
+# endif
+ mov %rdi, %rdx /* rdx = 1st argument; rdi is not written in this block */
+# ifdef USE_AS_STRNCPY
+ test %r8, %r8
+ jz L(Exit0) /* limit == 0 */
+ cmp $8, %r8
+ jbe L(StrncpyExit8Bytes) /* limit <= 8 (unsigned compare) */
+# endif
+ cmpb $0, (%rcx) /* bytes 0..7: L(ExitK) is taken when byte K-1 is NUL */
+ jz L(Exit1)
+ cmpb $0, 1(%rcx)
+ jz L(Exit2)
+ cmpb $0, 2(%rcx)
+ jz L(Exit3)
+ cmpb $0, 3(%rcx)
+ jz L(Exit4)
+ cmpb $0, 4(%rcx)
+ jz L(Exit5)
+ cmpb $0, 5(%rcx)
+ jz L(Exit6)
+ cmpb $0, 6(%rcx)
+ jz L(Exit7)
+ cmpb $0, 7(%rcx)
+ jz L(Exit8)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %r8
+ jb L(StrncpyExit15Bytes) /* limit < 16: leave before probing bytes 8..15 */
+# endif
+ cmpb $0, 8(%rcx) /* bytes 8..14, same pattern */
+ jz L(Exit9)
+ cmpb $0, 9(%rcx)
+ jz L(Exit10)
+ cmpb $0, 10(%rcx)
+ jz L(Exit11)
+ cmpb $0, 11(%rcx)
+ jz L(Exit12)
+ cmpb $0, 12(%rcx)
+ jz L(Exit13)
+ cmpb $0, 13(%rcx)
+ jz L(Exit14)
+ cmpb $0, 14(%rcx)
+ jz L(Exit15)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %r8
+ je L(Exit16) /* limit == 16: byte 15 is not probed */
+# endif
+ cmpb $0, 15(%rcx)
+ jz L(Exit16)
+# endif
+
+# ifdef USE_AS_STRNCPY
+ mov %rcx, %rsi
+ sub $16, %r8
+ and $0xf, %rsi /* rsi = rcx & 15 */
+
+/* r8 = r8 - 16 + (rcx & 15), i.e. the limit minus the distance from rcx to the next 16-byte boundary */
+
+ add %rsi, %r8
+# endif
+ lea 16(%rcx), %rsi
+ and $-16, %rsi /* rsi = first 16-byte boundary strictly above rcx */
+ pxor %xmm0, %xmm0 /* xmm0 = 0, the NUL pattern for pcmpeqb */
+ mov (%rcx), %r9 /* copy 16 bytes from (rcx) to (rdx) as two 8-byte moves */
+ mov %r9, (%rdx)
+ pcmpeqb (%rsi), %xmm0 /* 0xff in each lane where the aligned block at rsi holds a NUL */
+ mov 8(%rcx), %r9
+ mov %r9, 8(%rdx)
+
+/* convert the byte mask in xmm0 to a bit mask in rax (one bit per lane) */
+
+ pmovmskb %xmm0, %rax
+ sub %rcx, %rsi /* rsi = offset of that aligned block from rcx (1..16) */
+
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3) /* r8 ran out: the limit is reached no later than the end of the block just compared */
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes) /* NUL found in that block */
+/* No NUL: the compare result was all zero, so xmm0 is zero again.  Now round rdx up to the next 16-byte boundary. */
+ mov %rdx, %rax
+ lea 16(%rdx), %rdx
+ and $-16, %rdx
+ sub %rdx, %rax /* rax = old rdx - new rdx, a value in -16..-1 */
+
+# ifdef USE_AS_STRNCPY
+ add %rax, %rsi
+ lea -1(%rsi), %rsi
+ and $1<<31, %esi /* keep only bit 31 of (rsi + rax - 1) */
+ test %rsi, %rsi
+ jnz L(ContinueCopy) /* bit set, i.e. that 32-bit value is negative: leave r8 alone */
+ lea 16(%r8), %r8 /* otherwise give 16 back to r8 */
+
+L(ContinueCopy):
+# endif
+ sub %rax, %rcx /* advance rcx by the same distance rdx moved */
+ mov %rcx, %rax
+ and $0xf, %rax /* rax = rcx & 15, now that rdx is 16-byte aligned */
+ mov $0, %rsi /* mov rather than xor: ZF from the and above must survive for the jz below */
+
+/* case: rcx_offset == rdx_offset (rax == 0, both pointers 16-byte aligned) */
+
+ jz L(Align16Both)
+
+ cmp $8, %rax
+ jae L(ShlHigh8) /* 8..15; the flags of this cmp are reused at L(ShlHigh8) */
+ cmp $1, %rax
+ je L(Shl1)
+ cmp $2, %rax
+ je L(Shl2)
+ cmp $3, %rax
+ je L(Shl3)
+ cmp $4, %rax
+ je L(Shl4)
+ cmp $5, %rax
+ je L(Shl5)
+ cmp $6, %rax
+ je L(Shl6)
+ jmp L(Shl7) /* only 7 is left */
+
+L(ShlHigh8): /* rax is 8..15 here; the flags still hold the result of cmp $8, %rax */
+ je L(Shl8)
+ cmp $9, %rax
+ je L(Shl9)
+ cmp $10, %rax
+ je L(Shl10)
+ cmp $11, %rax
+ je L(Shl11)
+ cmp $12, %rax
+ je L(Shl12)
+ cmp $13, %rax
+ je L(Shl13)
+ cmp $14, %rax
+ je L(Shl14)
+ jmp L(Shl15) /* only 15 is left */
+
+L(Align16Both): /* rcx and rdx are both 16-byte aligned (movaps on each); rsi is 0 and xmm0 is zero on entry */
+ movaps (%rcx), %xmm1
+ movaps 16(%rcx), %xmm2
+ movaps %xmm1, (%rdx)
+ pcmpeqb %xmm2, %xmm0 /* flag NUL lanes of the block at offset 16 */
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi /* rsi = offset of the block just compared and not yet stored */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+/* Five more steps of the same shape: load the next block, store the previous NUL-free one, compare, advance rsi. */
+ movaps 16(%rcx, %rsi), %xmm3
+ movaps %xmm2, (%rdx, %rsi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm4
+ movaps %xmm3, (%rdx, %rsi)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm1
+ movaps %xmm4, (%rdx, %rsi)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm2
+ movaps %xmm1, (%rdx, %rsi)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm3
+ movaps %xmm2, (%rdx, %rsi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%rdx, %rsi) /* store the last block checked by the unrolled steps */
+ mov %rcx, %rax
+ lea 16(%rcx, %rsi), %rcx
+ and $-0x40, %rcx /* rcx = (rcx + rsi + 16) rounded down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = old rcx - new rcx */
+ sub %rax, %rdx /* move rdx by the same distance */
+# ifdef USE_AS_STRNCPY
+ lea 112(%r8, %rax), %r8 /* r8 += 112 + rax, re-basing the limit on the new rcx */
+# endif
+ mov $-0x40, %rsi /* rsi = -64; the 64-byte loop below advances rcx/rdx by 64 before it branches out */
+
+ .p2align 4
+L(Aligned64Loop): /* 64 bytes per iteration; movaps is used on both rcx and rdx, so both are 16-byte aligned here */
+ movaps (%rcx), %xmm2
+ movaps %xmm2, %xmm4 /* xmm4..xmm7 keep the four blocks intact for the stores */
+ movaps 16(%rcx), %xmm5
+ movaps 32(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%rcx), %xmm7
+ pminub %xmm5, %xmm2 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3 /* ... a lane of xmm3 is 0 iff some block has a NUL in that lane */
+ pcmpeqb %xmm0, %xmm3 /* result lands in xmm3, so xmm0 stays zero */
+ pmovmskb %xmm3, %rax
+ lea 64(%rdx), %rdx
+ lea 64(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeaveCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Aligned64Leave) /* a NUL is somewhere in these 64 bytes; none of them is stored yet */
+ movaps %xmm4, -64(%rdx) /* negative offsets because rdx was already advanced */
+ movaps %xmm5, -48(%rdx)
+ movaps %xmm6, -32(%rdx)
+ movaps %xmm7, -16(%rdx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave): /* find which of xmm4..xmm7 holds the NUL; rsi is -64 when reached from L(Aligned64Loop) and grows by 16 per block */
+# ifdef USE_AS_STRNCPY
+ lea 48(%r8), %r8
+# endif
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rax
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0 /* the previous mask was zero, so xmm0 is all zero again */
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ movaps %xmm4, -64(%rdx) /* xmm4 is NUL-free: store it */
+ test %rax, %rax
+ lea 16(%rsi), %rsi /* lea leaves the flags from test intact for the jnz */
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ movaps %xmm5, -48(%rdx)
+ test %rax, %rax
+ lea 16(%rsi), %rsi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%rdx)
+ pcmpeqb %xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+ jmp L(CopyFrom1To16Bytes) /* unconditional: the first three blocks were clear, so the NUL is in xmm7 */
+
+ .p2align 4
+L(Shl1): /* rcx & 15 == 1 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $1 */
+ movaps -1(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 15(%rcx), %xmm2 /* the aligned block after it */
+L(Shl1Start): /* L(Shl1LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 31(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -15(%rcx), %rcx /* restore the rcx & 15 == 1 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -1(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl1LoopStart):
+ movaps 15(%rcx), %xmm2
+ movaps 31(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 47(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 63(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $1, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $1, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl1Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave1)
+# endif
+ palignr $1, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl1LoopStart)
+
+L(Shl1LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 15(%rcx) */
+ movdqu -1(%rcx), %xmm1 /* unaligned 16-byte copy of offsets -1..14, i.e. everything below that block */
+ mov $15, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ movdqu %xmm1, -1(%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl2): /* rcx & 15 == 2 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $2 */
+ movaps -2(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 14(%rcx), %xmm2 /* the aligned block after it */
+L(Shl2Start): /* L(Shl2LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 30(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -14(%rcx), %rcx /* restore the rcx & 15 == 2 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -2(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl2LoopStart):
+ movaps 14(%rcx), %xmm2
+ movaps 30(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 46(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 62(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $2, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $2, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl2Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave2)
+# endif
+ palignr $2, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl2LoopStart)
+
+L(Shl2LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 14(%rcx) */
+ movdqu -2(%rcx), %xmm1 /* unaligned 16-byte copy of offsets -2..13, i.e. everything below that block */
+ mov $14, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ movdqu %xmm1, -2(%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl3): /* rcx & 15 == 3 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $3 */
+ movaps -3(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 13(%rcx), %xmm2 /* the aligned block after it */
+L(Shl3Start): /* L(Shl3LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 29(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -13(%rcx), %rcx /* restore the rcx & 15 == 3 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -3(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl3LoopStart):
+ movaps 13(%rcx), %xmm2
+ movaps 29(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 45(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 61(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $3, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $3, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl3Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave3)
+# endif
+ palignr $3, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl3LoopStart)
+
+L(Shl3LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 13(%rcx) */
+ movdqu -3(%rcx), %xmm1 /* unaligned 16-byte copy of offsets -3..12, i.e. everything below that block */
+ mov $13, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ movdqu %xmm1, -3(%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl4): /* rcx & 15 == 4 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $4 */
+ movaps -4(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 12(%rcx), %xmm2 /* the aligned block after it */
+L(Shl4Start): /* L(Shl4LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 28(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -12(%rcx), %rcx /* restore the rcx & 15 == 4 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -4(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl4LoopStart):
+ movaps 12(%rcx), %xmm2
+ movaps 28(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $4, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $4, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl4Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave4)
+# endif
+ palignr $4, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 12(%rcx) */
+ movdqu -4(%rcx), %xmm1 /* unaligned 16-byte copy of offsets -4..11, i.e. everything below that block */
+ mov $12, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ movdqu %xmm1, -4(%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl5): /* rcx & 15 == 5 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $5 */
+ movaps -5(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 11(%rcx), %xmm2 /* the aligned block after it */
+L(Shl5Start): /* L(Shl5LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 27(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -11(%rcx), %rcx /* restore the rcx & 15 == 5 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -5(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl5LoopStart):
+ movaps 11(%rcx), %xmm2
+ movaps 27(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 43(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 59(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $5, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $5, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl5Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave5)
+# endif
+ palignr $5, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl5LoopStart)
+
+L(Shl5LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 11(%rcx) */
+ movdqu -5(%rcx), %xmm1 /* unaligned 16-byte copy of offsets -5..10, i.e. everything below that block */
+ mov $11, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ movdqu %xmm1, -5(%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl6): /* rcx & 15 == 6 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $6 */
+ movaps -6(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 10(%rcx), %xmm2 /* the aligned block after it */
+L(Shl6Start): /* L(Shl6LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 26(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -10(%rcx), %rcx /* restore the rcx & 15 == 6 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -6(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl6LoopStart):
+ movaps 10(%rcx), %xmm2
+ movaps 26(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 42(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 58(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $6, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $6, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl6Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave6)
+# endif
+ palignr $6, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl6LoopStart)
+
+L(Shl6LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 10(%rcx) */
+ mov (%rcx), %r9 /* offsets 0..7 */
+ mov 6(%rcx), %esi /* offsets 6..9, overlapping the 8-byte move; together they cover the 10 bytes below that block */
+ mov %r9, (%rdx)
+ mov %esi, 6(%rdx)
+ mov $10, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl7): /* rcx & 15 == 7 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $7 */
+ movaps -7(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 9(%rcx), %xmm2 /* the aligned block after it */
+L(Shl7Start): /* L(Shl7LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 25(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -9(%rcx), %rcx /* restore the rcx & 15 == 7 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -7(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl7LoopStart):
+ movaps 9(%rcx), %xmm2
+ movaps 25(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 41(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 57(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $7, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $7, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl7Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave7)
+# endif
+ palignr $7, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl7LoopStart)
+
+L(Shl7LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 9(%rcx) */
+ mov (%rcx), %r9 /* offsets 0..7 */
+ mov 5(%rcx), %esi /* offsets 5..8, overlapping the 8-byte move; together they cover the 9 bytes below that block */
+ mov %r9, (%rdx)
+ mov %esi, 5(%rdx)
+ mov $9, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl8): /* rcx & 15 == 8 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $8 */
+ movaps -8(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 8(%rcx), %xmm2 /* the aligned block after it */
+L(Shl8Start): /* L(Shl8LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 24(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -8(%rcx), %rcx /* restore the rcx & 15 == 8 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -8(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl8LoopStart):
+ movaps 8(%rcx), %xmm2
+ movaps 24(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $8, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $8, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl8Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave8)
+# endif
+ palignr $8, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 8(%rcx) */
+ mov (%rcx), %r9 /* offsets 0..7, exactly the 8 bytes below that block */
+ mov $8, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ mov %r9, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl9): /* rcx & 15 == 9 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $9 */
+ movaps -9(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 7(%rcx), %xmm2 /* the aligned block after it */
+L(Shl9Start): /* L(Shl9LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 23(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -7(%rcx), %rcx /* restore the rcx & 15 == 9 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -9(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl9LoopStart):
+ movaps 7(%rcx), %xmm2
+ movaps 23(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 39(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 55(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $9, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $9, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl9Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave9)
+# endif
+ palignr $9, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl9LoopStart)
+
+L(Shl9LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 7(%rcx) */
+ mov -1(%rcx), %r9 /* offsets -1..6: one 8-byte move covers the 7 bytes below that block */
+ mov $7, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ mov %r9, -1(%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl10): /* rcx & 15 == 10 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $10 */
+ movaps -10(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 6(%rcx), %xmm2 /* the aligned block after it */
+L(Shl10Start): /* L(Shl10LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 22(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -6(%rcx), %rcx /* restore the rcx & 15 == 10 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -10(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl10LoopStart):
+ movaps 6(%rcx), %xmm2
+ movaps 22(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 38(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 54(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $10, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $10, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl10Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave10)
+# endif
+ palignr $10, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl10LoopStart)
+
+L(Shl10LoopExit): /* taken with a non-zero NUL mask in rax for the aligned block at 6(%rcx) */
+ mov -2(%rcx), %r9 /* offsets -2..5: one 8-byte move covers the 6 bytes below that block */
+ mov $6, %rsi /* rsi = offset of that block; presumably read by L(CopyFrom1To16Bytes) (not in view) */
+ mov %r9, -2(%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl11): /* rcx & 15 == 11 while rdx is 16-byte aligned: load aligned blocks and realign them with palignr $11 */
+ movaps -11(%rcx), %xmm1 /* aligned block that contains the byte at rcx */
+ movaps 5(%rcx), %xmm2 /* the aligned block after it */
+L(Shl11Start): /* L(Shl11LoopStart) also branches here, with xmm1 and xmm2 already loaded */
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is zero on entry: flag NUL lanes of xmm2 */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2 /* xmm2 = the 16 bytes starting at rcx */
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1 and xmm3 alternate as the saved previous block */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ lea 21(%rcx), %rcx /* rcx = address of the next aligned block to read */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that read address down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back */
+ lea -5(%rcx), %rcx /* restore the rcx & 15 == 11 bias */
+ sub %rax, %rdx /* step rdx back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and return those bytes to the limit */
+# endif
+ movaps -11(%rcx), %xmm1 /* block just below the first one the loop reads */
+
+/* 64 bytes loop: xmm1 holds the aligned block just below the four loaded here */
+ .p2align 4
+L(Shl11LoopStart):
+ movaps 5(%rcx), %xmm2
+ movaps 21(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 37(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 53(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* unsigned byte minimum over the four blocks ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... a lane of xmm7 is 0 iff one of them has a NUL there */
+ pcmpeqb %xmm0, %xmm7 /* result lands in xmm7, so xmm0 stays zero */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last block, becomes xmm1 for the next iteration */
+ palignr $11, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $11, %xmm3, %xmm4 /* palignr does not touch the flags set by test */
+ jnz L(Shl11Start) /* NUL in these 64 bytes: redo them 16 at a time; xmm1 and xmm2 are still unshifted */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave11)
+# endif
+ palignr $11, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+ mov -3(%rcx), %r9
+ mov $5, %rsi
+ mov %r9, -3(%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12): /* Copy path for a source pointer that sits 12 bytes past a 16-byte
+   boundary: the movaps loads at -12(%rcx) and 4(%rcx) require aligned
+   addresses.  Consecutive aligned vectors are merged with palignr $12 so
+   that each aligned store to (%rdx) receives the 16 source bytes that start
+   at (%rcx).  %xmm0 serves as the all-zero comparand for the NUL search;
+   presumably it is cleared by the code that jumps here -- TODO confirm
+   against the dispatcher outside this view.  Under USE_AS_STRNCPY, %r8 is
+   the running length counter.  */
+ movaps -12(%rcx), %xmm1 /* top 4 lanes are the source bytes 0..3 */
+ movaps 4(%rcx), %xmm2 /* source bytes 4..19 */
+L(Shl12Start): /* 16 bytes per step, unrolled four times; also the re-entry point from the 64-byte loop */
+ pcmpeqb %xmm2, %xmm0 /* 0xff in each lane where xmm2 is zero (xmm0 stays zero if none) */
+ pmovmskb %xmm0, %rax /* rax = lane mask of NUL bytes */
+ movaps %xmm2, %xmm3 /* keep the raw vector for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* charge this vector against the length counter */
+ jbe L(StrncpyExit12Case2OrCase3) /* counter exhausted; rax still holds the mask */
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit) /* NUL inside xmm2 */
+
+ palignr $12, %xmm1, %xmm2 /* xmm2 = top 4 bytes of xmm1 followed by low 12 bytes of xmm2 */
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1/xmm3 alternate as the "previous vector" register */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2 /* xmm3 still holds the vector saved in the third step */
+ movaps %xmm2, (%rdx)
+ lea 20(%rcx), %rcx /* rcx = aligned address just past the last vector consumed */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round the source down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back by the rounding */
+ lea -4(%rcx), %rcx /* re-apply the 4-byte bias that the offsets below expect */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and credit those bytes back to the counter */
+# endif
+ movaps -12(%rcx), %xmm1 /* "previous vector" for the first loop iteration */
+
+/* 64 bytes loop */
+ .p2align 4
+L(Shl12LoopStart):
+ movaps 4(%rcx), %xmm2
+ movaps 20(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a zero lane in the minimum means a NUL somewhere in these 64 bytes */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* raw fourth vector, becomes xmm1 for the next iteration */
+ palignr $12, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $12, %xmm3, %xmm4 /* palignr leaves EFLAGS alone, so the test result survives */
+ jnz L(Shl12Start) /* NUL present: redo this span 16 bytes at a time (xmm1/xmm2 are already loaded) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave12) /* fewer than 64 counted bytes remained */
+# endif
+ palignr $12, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit): /* the NUL is in the aligned vector at 4(%rcx); rax is its lane mask */
+ mov (%rcx), %r9d
+ mov $4, %rsi /* CopyFrom1To16Bytes advances both pointers by rsi */
+ mov %r9d, (%rdx) /* copy the 4 bytes that precede that vector */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl13): /* Copy path for a source pointer that sits 13 bytes past a 16-byte
+   boundary (movaps at -13(%rcx) and 3(%rcx) must be aligned).  Aligned
+   vectors are merged with palignr $13 so each aligned store to (%rdx) gets
+   the 16 source bytes starting at (%rcx).  %xmm0 is the all-zero comparand
+   for the NUL search; presumably cleared by the code that jumps here --
+   TODO confirm.  Under USE_AS_STRNCPY, %r8 is the running length counter.  */
+ movaps -13(%rcx), %xmm1 /* top 3 lanes are the source bytes 0..2 */
+ movaps 3(%rcx), %xmm2 /* source bytes 3..18 */
+L(Shl13Start): /* 16 bytes per step, unrolled four times; also the re-entry point from the 64-byte loop */
+ pcmpeqb %xmm2, %xmm0 /* 0xff in each lane where xmm2 is zero */
+ pmovmskb %xmm0, %rax /* rax = lane mask of NUL bytes */
+ movaps %xmm2, %xmm3 /* keep the raw vector for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3) /* counter exhausted; rax still holds the mask */
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit) /* NUL inside xmm2 */
+
+ palignr $13, %xmm1, %xmm2 /* xmm2 = top 3 bytes of xmm1 followed by low 13 bytes of xmm2 */
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1/xmm3 alternate as the "previous vector" register */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm3, %xmm2 /* xmm3 still holds the vector saved in the third step */
+ movaps %xmm2, (%rdx)
+ lea 19(%rcx), %rcx /* rcx = aligned address just past the last vector consumed */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round the source down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back by the rounding */
+ lea -3(%rcx), %rcx /* re-apply the 3-byte bias that the offsets below expect */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and credit those bytes back to the counter */
+# endif
+ movaps -13(%rcx), %xmm1 /* "previous vector" for the first loop iteration */
+
+/* 64 bytes loop */
+ .p2align 4
+L(Shl13LoopStart):
+ movaps 3(%rcx), %xmm2
+ movaps 19(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 35(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 51(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a zero lane in the minimum means a NUL somewhere in these 64 bytes */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* raw fourth vector, becomes xmm1 for the next iteration */
+ palignr $13, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $13, %xmm3, %xmm4 /* palignr leaves EFLAGS alone, so the test result survives */
+ jnz L(Shl13Start) /* NUL present: redo this span 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave13) /* fewer than 64 counted bytes remained */
+# endif
+ palignr $13, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl13LoopStart)
+
+L(Shl13LoopExit): /* the NUL is in the aligned vector at 3(%rcx); rax is its lane mask */
+ mov -1(%rcx), %r9d
+ mov $3, %rsi /* CopyFrom1To16Bytes advances both pointers by rsi */
+ mov %r9d, -1(%rdx) /* 4-byte move covering the 3 bytes before that vector plus one earlier byte */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl14): /* Copy path for a source pointer that sits 14 bytes past a 16-byte
+   boundary (movaps at -14(%rcx) and 2(%rcx) must be aligned).  Aligned
+   vectors are merged with palignr $14 so each aligned store to (%rdx) gets
+   the 16 source bytes starting at (%rcx).  %xmm0 is the all-zero comparand
+   for the NUL search; presumably cleared by the code that jumps here --
+   TODO confirm.  Under USE_AS_STRNCPY, %r8 is the running length counter.  */
+ movaps -14(%rcx), %xmm1 /* top 2 lanes are the source bytes 0..1 */
+ movaps 2(%rcx), %xmm2 /* source bytes 2..17 */
+L(Shl14Start): /* 16 bytes per step, unrolled four times; also the re-entry point from the 64-byte loop */
+ pcmpeqb %xmm2, %xmm0 /* 0xff in each lane where xmm2 is zero */
+ pmovmskb %xmm0, %rax /* rax = lane mask of NUL bytes */
+ movaps %xmm2, %xmm3 /* keep the raw vector for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3) /* counter exhausted; rax still holds the mask */
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit) /* NUL inside xmm2 */
+
+ palignr $14, %xmm1, %xmm2 /* xmm2 = top 2 bytes of xmm1 followed by low 14 bytes of xmm2 */
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1/xmm3 alternate as the "previous vector" register */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm3, %xmm2 /* xmm3 still holds the vector saved in the third step */
+ movaps %xmm2, (%rdx)
+ lea 18(%rcx), %rcx /* rcx = aligned address just past the last vector consumed */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round the source down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back by the rounding */
+ lea -2(%rcx), %rcx /* re-apply the 2-byte bias that the offsets below expect */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and credit those bytes back to the counter */
+# endif
+ movaps -14(%rcx), %xmm1 /* "previous vector" for the first loop iteration */
+
+/* 64 bytes loop */
+ .p2align 4
+L(Shl14LoopStart):
+ movaps 2(%rcx), %xmm2
+ movaps 18(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 34(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 50(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a zero lane in the minimum means a NUL somewhere in these 64 bytes */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* raw fourth vector, becomes xmm1 for the next iteration */
+ palignr $14, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $14, %xmm3, %xmm4 /* palignr leaves EFLAGS alone, so the test result survives */
+ jnz L(Shl14Start) /* NUL present: redo this span 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave14) /* fewer than 64 counted bytes remained */
+# endif
+ palignr $14, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl14LoopStart)
+
+L(Shl14LoopExit): /* the NUL is in the aligned vector at 2(%rcx); rax is its lane mask */
+ mov -2(%rcx), %r9d
+ mov $2, %rsi /* CopyFrom1To16Bytes advances both pointers by rsi */
+ mov %r9d, -2(%rdx) /* 4-byte move covering the 2 bytes before that vector plus two earlier bytes */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl15): /* Copy path for a source pointer that sits 15 bytes past a 16-byte
+   boundary (movaps at -15(%rcx) and 1(%rcx) must be aligned).  Aligned
+   vectors are merged with palignr $15 so each aligned store to (%rdx) gets
+   the 16 source bytes starting at (%rcx).  %xmm0 is the all-zero comparand
+   for the NUL search; presumably cleared by the code that jumps here --
+   TODO confirm.  Under USE_AS_STRNCPY, %r8 is the running length counter.  */
+ movaps -15(%rcx), %xmm1 /* top lane is source byte 0 */
+ movaps 1(%rcx), %xmm2 /* source bytes 1..16 */
+L(Shl15Start): /* 16 bytes per step, unrolled four times; also the re-entry point from the 64-byte loop */
+ pcmpeqb %xmm2, %xmm0 /* 0xff in each lane where xmm2 is zero */
+ pmovmskb %xmm0, %rax /* rax = lane mask of NUL bytes */
+ movaps %xmm2, %xmm3 /* keep the raw vector for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3) /* counter exhausted; rax still holds the mask */
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit) /* NUL inside xmm2 */
+
+ palignr $15, %xmm1, %xmm2 /* xmm2 = top byte of xmm1 followed by low 15 bytes of xmm2 */
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm1 /* xmm1/xmm3 alternate as the "previous vector" register */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm3, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm3, %xmm2 /* xmm3 still holds the vector saved in the third step */
+ movaps %xmm2, (%rdx)
+ lea 17(%rcx), %rcx /* rcx = aligned address just past the last vector consumed */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round the source down to a 64-byte boundary */
+ sub %rcx, %rax /* rax = number of bytes stepped back by the rounding */
+ lea -1(%rcx), %rcx /* re-apply the 1-byte bias that the offsets below expect */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and credit those bytes back to the counter */
+# endif
+ movaps -15(%rcx), %xmm1 /* "previous vector" for the first loop iteration */
+
+/* 64 bytes loop */
+ .p2align 4
+L(Shl15LoopStart):
+ movaps 1(%rcx), %xmm2
+ movaps 17(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 33(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 49(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a zero lane in the minimum means a NUL somewhere in these 64 bytes */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* raw fourth vector, becomes xmm1 for the next iteration */
+ palignr $15, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $15, %xmm3, %xmm4 /* palignr leaves EFLAGS alone, so the test result survives */
+ jnz L(Shl15Start) /* NUL present: redo this span 16 bytes at a time */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave15) /* fewer than 64 counted bytes remained */
+# endif
+ palignr $15, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl15LoopStart)
+
+L(Shl15LoopExit): /* the NUL is in the aligned vector at 1(%rcx); rax is its lane mask */
+ mov -3(%rcx), %r9d
+ mov $1, %rsi /* CopyFrom1To16Bytes advances both pointers by rsi */
+ mov %r9d, -3(%rdx) /* 4-byte move covering the byte before that vector plus three earlier bytes */
+# ifdef USE_AS_STRCAT
+ jmp L(CopyFrom1To16Bytes) /* without USE_AS_STRCAT execution simply falls through into that label */
+# endif
+
+# ifndef USE_AS_STRCAT
+
+ .p2align 4
+L(CopyFrom1To16Bytes): /* Tail dispatcher.  On entry %ax is a NUL lane mask and
+   %rsi the distance from the current pointers to the data that mask
+   describes.  Bit k of the mask selects L(Exit<k+1>), which copies k+1
+   bytes (terminator included) and returns.  */
+# ifdef USE_AS_STRNCPY
+ add $16, %r8 /* add 16 back to the length counter (the Shl paths subtract 16 before jumping here) */
+# endif
+ add %rsi, %rdx
+ add %rsi, %rcx
+
+ test %al, %al
+ jz L(ExitHigh) /* no NUL in lanes 0..7: look at lanes 8..15 */
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ /* al is non-zero with bits 0..6 clear, so bit 7 is set: fall into Exit8 */
+ .p2align 4
+L(Exit8): /* copy 8 bytes from (%rcx) to (%rdx) and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $8, %r8 /* r8 = count left after these 8 bytes */
+ lea 8(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(ExitHigh): /* second half of the mask dispatch: bit k of %ah selects L(Exit<k+9>) */
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ /* none of bits 0..6 of ah matched: fall into Exit16 */
+ .p2align 4
+L(Exit16): /* copy 16 bytes from (%rcx) to (%rdx) as two 8-byte moves and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %rax
+ mov %rax, 8(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 = count left after these 16 bytes */
+ lea 16(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2): /* strncpy tail where both limits apply: %ax is a
+   non-empty NUL lane mask and %r8 (after the +16 below) is the number of
+   bytes the length counter still allows.  Whichever limit comes first picks
+   the ExitN routine.  */
+ add $16, %r8
+ add %rsi, %rcx
+ lea (%rsi, %rdx), %rsi /* rsi = advanced destination, parked while rdx is used as scratch */
+ lea -9(%r8), %rdx
+ and $1<<7, %dh /* keep bit 15 of r8-9: set when r8 < 9 (r8 is expected to be small here -- TODO confirm range) */
+ or %al, %dh /* ... or when a NUL sits in lanes 0..7 */
+ test %dh, %dh
+ lea (%rsi), %rdx /* restore the destination; lea keeps the flags from test */
+ jz L(ExitHighCase2) /* count >= 9 and no NUL in the low 8 lanes */
+
+ cmp $1, %r8
+ je L(Exit1)
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $2, %r8
+ je L(Exit2)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $3, %r8
+ je L(Exit3)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $4, %r8
+ je L(Exit4)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $5, %r8
+ je L(Exit5)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $6, %r8
+ je L(Exit6)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $7, %r8
+ je L(Exit7)
+ test $0x40, %al
+ jnz L(Exit7)
+ jmp L(Exit8)
+
+ .p2align 4
+L(ExitHighCase2): /* continuation of Case2 for lanes 8..15: for N = 9..15 take
+   L(ExitN) as soon as either the remaining count in %r8 equals N or bit N-9
+   of %ah marks a NUL; otherwise copy the full 16 bytes.  */
+ cmp $9, %r8
+ je L(Exit9)
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $10, %r8
+ je L(Exit10)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $11, %r8
+ je L(Exit11)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $12, %r8
+ je L(Exit12)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $13, %r8
+ je L(Exit13)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $14, %r8
+ je L(Exit14)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $15, %r8
+ je L(Exit15)
+ test $0x40, %ah
+ jnz L(Exit15)
+ jmp L(Exit16)
+
+L(CopyFrom1To16BytesCase2OrCase3): /* pick Case2 when the NUL mask in %rax is non-empty, else fall into Case3 */
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3): /* strncpy tail with no NUL in range: only the
+   remaining count decides.  After the +16 below, %r8 selects L(Exit<r8>)
+   through a small comparison tree.  */
+ add $16, %r8
+ add %rsi, %rdx
+ add %rsi, %rcx
+
+ cmp $16, %r8
+ je L(Exit16)
+ cmp $8, %r8
+ je L(Exit8)
+ jg L(More8Case3)
+ cmp $4, %r8
+ je L(Exit4)
+ jg L(More4Case3)
+ cmp $2, %r8
+ jl L(Exit1) /* every value below 2 lands here */
+ je L(Exit2)
+ jg L(Exit3)
+L(More8Case3): /* but less than 16 */
+ cmp $12, %r8
+ je L(Exit12)
+ jl L(Less12Case3)
+ cmp $14, %r8
+ jl L(Exit13)
+ je L(Exit14)
+ jg L(Exit15)
+L(More4Case3): /* but less than 8 */
+ cmp $6, %r8
+ jl L(Exit5)
+ je L(Exit6)
+ jg L(Exit7)
+L(Less12Case3): /* but more than 8 */
+ cmp $10, %r8
+ jl L(Exit9)
+ je L(Exit10)
+ jg L(Exit11)
+# endif
+
+ .p2align 4
+L(Exit1): /* copy 1 byte from (%rcx) to (%rdx) and return */
+ movb (%rcx), %al
+ movb %al, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $1, %r8 /* r8 = count left after this byte */
+ lea 1(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit2): /* copy 2 bytes from (%rcx) to (%rdx) and return */
+ movw (%rcx), %ax
+ movw %ax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $2, %r8 /* r8 = count left after these bytes */
+ lea 2(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit3): /* copy 3 bytes from (%rcx) to (%rdx) (2 + 1) and return */
+ movw (%rcx), %ax
+ movw %ax, (%rdx)
+ movb 2(%rcx), %al
+ movb %al, 2(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $3, %r8 /* r8 = count left after these bytes */
+ lea 3(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit4): /* copy 4 bytes from (%rcx) to (%rdx) and return */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $4, %r8 /* r8 = count left after these bytes */
+ lea 4(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit5): /* copy 5 bytes from (%rcx) to (%rdx) (4 + 1) and return */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ movb 4(%rcx), %al
+ movb %al, 4(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $5, %r8 /* r8 = count left after these bytes */
+ lea 5(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit6): /* copy 6 bytes from (%rcx) to (%rdx) (4 + 2) and return */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ movw 4(%rcx), %ax
+ movw %ax, 4(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $6, %r8 /* r8 = count left after these bytes */
+ lea 6(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit7): /* copy 7 bytes from (%rcx) to (%rdx) and return */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ movl 3(%rcx), %eax /* second 4-byte move overlaps the first by one byte (bytes 3..6) */
+ movl %eax, 3(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $7, %r8 /* r8 = count left after these bytes */
+ lea 7(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit9): /* copy 9 bytes from (%rcx) to (%rdx) and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 5(%rcx), %eax /* overlapping 4-byte move covering bytes 5..8 */
+ mov %eax, 5(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $9, %r8 /* r8 = count left after these bytes */
+ lea 9(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit10): /* copy 10 bytes from (%rcx) to (%rdx) and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 6(%rcx), %eax /* overlapping 4-byte move covering bytes 6..9 */
+ mov %eax, 6(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $10, %r8 /* r8 = count left after these bytes */
+ lea 10(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit11): /* copy 11 bytes from (%rcx) to (%rdx) and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 7(%rcx), %eax /* overlapping 4-byte move covering bytes 7..10 */
+ mov %eax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $11, %r8 /* r8 = count left after these bytes */
+ lea 11(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit12): /* copy 12 bytes from (%rcx) to (%rdx) (8 + 4) and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %eax
+ mov %eax, 8(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $12, %r8 /* r8 = count left after these bytes */
+ lea 12(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit13): /* copy 13 bytes from (%rcx) to (%rdx) and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 5(%rcx), %rax /* overlapping 8-byte move covering bytes 5..12 */
+ mov %rax, 5(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $13, %r8 /* r8 = count left after these bytes */
+ lea 13(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit14): /* copy 14 bytes from (%rcx) to (%rdx) and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 6(%rcx), %rax /* overlapping 8-byte move covering bytes 6..13 */
+ mov %rax, 6(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $14, %r8 /* r8 = count left after these bytes */
+ lea 14(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit15): /* copy 15 bytes from (%rcx) to (%rdx) and return */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 7(%rcx), %rax /* overlapping 8-byte move covering bytes 7..14 */
+ mov %rax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdx), %rax /* address of the last byte written */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $15, %r8 /* r8 = count left after these bytes */
+ lea 15(%rdx), %rcx /* first unwritten destination byte; lea keeps the flags from sub */
+ jnz L(StrncpyFillTailWithZero1) /* zero-fill whatever remains */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# endif
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(Fill0): /* Fill<N> family: store N bytes taken from %rdx at (%rcx) and
+   return.  L(StrncpyFillTailWithZero1) zeroes %rdx before dispatching here,
+   so these write the final 0..16 zero-padding bytes.  %rax is not touched,
+   so the return value prepared by the ExitN routine is preserved.  */
+ ret
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill3):
+ movw %dx, (%rcx)
+ movb %dl, 2(%rcx)
+ ret
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%rcx)
+ movb %dl, 4(%rcx)
+ ret
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%rcx)
+ movw %dx, 4(%rcx)
+ ret
+
+ .p2align 4
+L(Fill7):
+ movl %edx, (%rcx)
+ movl %edx, 3(%rcx) /* overlaps the first store by one byte */
+ ret
+
+ .p2align 4
+L(Fill8):
+ mov %rdx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill9):
+ mov %rdx, (%rcx)
+ movb %dl, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill10):
+ mov %rdx, (%rcx)
+ movw %dx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill11):
+ mov %rdx, (%rcx)
+ movl %edx, 7(%rcx) /* overlaps the first store by one byte */
+ ret
+
+ .p2align 4
+L(Fill12):
+ mov %rdx, (%rcx)
+ movl %edx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill13):
+ mov %rdx, (%rcx)
+ mov %rdx, 5(%rcx) /* overlapping 8-byte store covering bytes 5..12 */
+ ret
+
+ .p2align 4
+L(Fill14):
+ mov %rdx, (%rcx)
+ mov %rdx, 6(%rcx) /* overlapping 8-byte store covering bytes 6..13 */
+ ret
+
+ .p2align 4
+L(Fill15):
+ mov %rdx, (%rcx)
+ mov %rdx, 7(%rcx) /* overlapping 8-byte store covering bytes 7..14 */
+ ret
+
+ .p2align 4
+L(Fill16):
+ mov %rdx, (%rcx)
+ mov %rdx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(StrncpyFillExit1): /* entry used when %r8 is 16 below the real remaining count */
+ lea 16(%r8), %r8
+L(FillFrom1To16Bytes): /* dispatch on %r8 to L(Fill<r8>) through a small comparison tree */
+ test %r8, %r8
+ jz L(Fill0)
+ cmp $16, %r8
+ je L(Fill16)
+ cmp $8, %r8
+ je L(Fill8)
+ jg L(FillMore8)
+ cmp $4, %r8
+ je L(Fill4)
+ jg L(FillMore4)
+ cmp $2, %r8
+ jl L(Fill1)
+ je L(Fill2)
+ jg L(Fill3)
+L(FillMore8): /* but less than 16 */
+ cmp $12, %r8
+ je L(Fill12)
+ jl L(FillLess12)
+ cmp $14, %r8
+ jl L(Fill13)
+ je L(Fill14)
+ jg L(Fill15)
+L(FillMore4): /* but less than 8 */
+ cmp $6, %r8
+ jl L(Fill5)
+ je L(Fill6)
+ jg L(Fill7)
+L(FillLess12): /* but more than 8 */
+ cmp $10, %r8
+ jl L(Fill9)
+ je L(Fill10)
+ jmp L(Fill11)
+
+ .p2align 4
+L(StrncpyFillTailWithZero1): /* Zero-fill %r8 bytes starting at (%rcx).  Reached
+   from the ExitN routines with %r8 != 0.  %rax is left untouched.  */
+ xor %rdx, %rdx /* rdx = 0: the fill value used by the Fill<N> stubs */
+ sub $16, %r8
+ jbe L(StrncpyFillExit1) /* 16 bytes or fewer: one Fill<N> finishes the job */
+
+ pxor %xmm0, %xmm0
+ mov %rdx, (%rcx) /* first 16 zero bytes with unaligned 8-byte stores */
+ mov %rdx, 8(%rcx)
+
+ lea 16(%rcx), %rcx
+
+ mov %rcx, %rdx
+ and $0xf, %rdx /* rdx = misalignment of rcx */
+ sub %rdx, %rcx /* round rcx down to a 16-byte boundary for movdqa */
+ add %rdx, %r8 /* the bytes stepped back over must be written again */
+ xor %rdx, %rdx /* restore the zero fill value */
+ sub $64, %r8
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa): /* 64 zero bytes per iteration */
+ movdqa %xmm0, (%rcx)
+ movdqa %xmm0, 16(%rcx)
+ movdqa %xmm0, 32(%rcx)
+ movdqa %xmm0, 48(%rcx)
+ lea 64(%rcx), %rcx
+ sub $64, %r8
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64): /* r8 = remaining - 64 here (negative) */
+ add $32, %r8 /* r8 = remaining - 32 */
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%rcx)
+ movdqa %xmm0, 16(%rcx)
+ lea 32(%rcx), %rcx
+ sub $16, %r8 /* r8 = remaining - 16 after the 32 bytes just stored */
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%rcx)
+ lea 16(%rcx), %rcx
+ jmp L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32): /* r8 = remaining - 32 here (negative) */
+ add $16, %r8 /* r8 = remaining - 16 */
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%rcx)
+ lea 16(%rcx), %rcx
+ jmp L(FillFrom1To16Bytes)
+
+ .p2align 4
+L(Exit0): /* zero-length strncpy: the entry code jumps here when %r8 is 0 */
+ mov %rdx, %rax /* rdx was loaded from rdi (the destination) at function entry */
+ ret
+
+ .p2align 4
+L(StrncpyExit15Bytes): /* Short strncpy case; the caller is outside this view,
+   presumably used when the count in %r8 is 9..15 and bytes 0..7 hold no NUL
+   -- TODO confirm.  For N = 9..14 take L(ExitN) when the count equals N or
+   source byte N-1 is NUL; otherwise copy 15 bytes inline.  */
+ cmp $9, %r8
+ je L(Exit9)
+ cmpb $0, 8(%rcx)
+ jz L(Exit9)
+ cmp $10, %r8
+ je L(Exit10)
+ cmpb $0, 9(%rcx)
+ jz L(Exit10)
+ cmp $11, %r8
+ je L(Exit11)
+ cmpb $0, 10(%rcx)
+ jz L(Exit11)
+ cmp $12, %r8
+ je L(Exit12)
+ cmpb $0, 11(%rcx)
+ jz L(Exit12)
+ cmp $13, %r8
+ je L(Exit13)
+ cmpb $0, 12(%rcx)
+ jz L(Exit13)
+ cmp $14, %r8
+ je L(Exit14)
+ cmpb $0, 13(%rcx)
+ jz L(Exit14)
+ mov (%rcx), %rax /* 15 bytes as two overlapping 8-byte moves */
+ mov %rax, (%rdx)
+ mov 7(%rcx), %rax
+ mov %rax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdx), %rax /* address of the last byte written */
+ cmpb $1, (%rax) /* CF = 1 iff that byte is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit8Bytes): /* Short strncpy case: the entry code jumps here when the
+   count in %r8 is at most 8 (and non-zero).  For N = 1..7 take L(ExitN) when
+   the count equals N or source byte N-1 is NUL; otherwise copy 8 bytes
+   inline.  */
+ cmp $1, %r8
+ je L(Exit1)
+ cmpb $0, (%rcx)
+ jz L(Exit1)
+ cmp $2, %r8
+ je L(Exit2)
+ cmpb $0, 1(%rcx)
+ jz L(Exit2)
+ cmp $3, %r8
+ je L(Exit3)
+ cmpb $0, 2(%rcx)
+ jz L(Exit3)
+ cmp $4, %r8
+ je L(Exit4)
+ cmpb $0, 3(%rcx)
+ jz L(Exit4)
+ cmp $5, %r8
+ je L(Exit5)
+ cmpb $0, 4(%rcx)
+ jz L(Exit5)
+ cmp $6, %r8
+ je L(Exit6)
+ cmpb $0, 5(%rcx)
+ jz L(Exit6)
+ cmp $7, %r8
+ je L(Exit7)
+ cmpb $0, 6(%rcx)
+ jz L(Exit7)
+ mov (%rcx), %rax /* count is 8 and bytes 0..6 are non-NUL: copy all 8 */
+ mov %rax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdx), %rax /* address of the last byte written */
+ cmpb $1, (%rax) /* CF = 1 iff that byte is NUL */
+ sbb $-1, %rax /* rax += 1 - CF: step past a non-NUL last byte */
+# else
+ mov %rdi, %rax /* rdi still holds the destination argument */
+# endif
+ ret
+
+# endif
+# endif
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(StrncpyLeaveCase2OrCase3): /* strncpy exit from a 64-byte loop whose caller is
+   outside this view.  NOTE(review): the code reads as if %xmm4..%xmm7 hold
+   four consecutive 16-byte source vectors, %rax a combined NUL mask, %rdx a
+   destination already advanced by 64 and %r8 the count minus 64 -- confirm
+   against the caller.  */
+ test %rax, %rax
+ jnz L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3): /* no NUL: store whole vectors while the count allows */
+ lea 64(%r8), %r8
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm4, -64(%rdx)
+ lea 16(%rsi), %rsi /* rsi tracks the offset the tail dispatcher will add to the pointers */
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm5, -48(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm6, -32(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2): /* a NUL exists: test each vector in turn (%xmm0 is presumably zero on entry) */
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rax
+ add $48, %r8
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0 /* xmm0 is still zero here because the previous mask was empty */
+ pmovmskb %xmm0, %rax
+ movaps %xmm4, -64(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm5, -48(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm6, -32(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+ jmp L(CopyFrom1To16BytesCase2)
+/*--------------------------------------------------*/
+ .p2align 4
+L(StrncpyExit1Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl1 caller is outside this view */
+ movdqu -1(%rcx), %xmm0 /* unaligned 16-byte copy covering source bytes -1..14 (clobbers xmm0) */
+ movdqu %xmm0, -1(%rdx)
+ mov $15, %rsi /* offset the tail dispatcher adds to both pointers */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit2Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl2 caller is outside this view */
+ movdqu -2(%rcx), %xmm0 /* unaligned 16-byte copy covering source bytes -2..13 (clobbers xmm0) */
+ movdqu %xmm0, -2(%rdx)
+ mov $14, %rsi /* offset the tail dispatcher adds to both pointers */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit3Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl3 caller is outside this view */
+ movdqu -3(%rcx), %xmm0 /* unaligned 16-byte copy covering source bytes -3..12 (clobbers xmm0) */
+ movdqu %xmm0, -3(%rdx)
+ mov $13, %rsi /* offset the tail dispatcher adds to both pointers */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit4Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl4 caller is outside this view */
+ movdqu -4(%rcx), %xmm0 /* unaligned 16-byte copy covering source bytes -4..11 (clobbers xmm0) */
+ movdqu %xmm0, -4(%rdx)
+ mov $12, %rsi /* offset the tail dispatcher adds to both pointers */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit5Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl5 caller is outside this view */
+ movdqu -5(%rcx), %xmm0 /* unaligned 16-byte copy covering source bytes -5..10 (clobbers xmm0) */
+ movdqu %xmm0, -5(%rdx)
+ mov $11, %rsi /* offset the tail dispatcher adds to both pointers */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit6Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl6 caller is outside this view */
+ mov (%rcx), %rsi /* rsi is scratch here; it gets its real value below */
+ mov 6(%rcx), %r9d
+ mov %r9d, 6(%rdx) /* bytes 6..9 */
+ mov %rsi, (%rdx) /* bytes 0..7: together the 10 bytes ahead of the tail */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ mov $10, %rsi /* mov does not alter the flags set by test */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit7Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl7 caller is outside this view */
+ mov (%rcx), %rsi /* rsi is scratch here; it gets its real value below */
+ mov 5(%rcx), %r9d
+ mov %r9d, 5(%rdx) /* bytes 5..8 */
+ mov %rsi, (%rdx) /* bytes 0..7: together the 9 bytes ahead of the tail */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ mov $9, %rsi /* mov does not alter the flags set by test */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit8Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl8 caller is outside this view */
+ mov (%rcx), %r9
+ mov $8, %rsi /* offset the tail dispatcher adds to both pointers */
+ mov %r9, (%rdx) /* the 8 bytes ahead of the tail */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit9Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl9 caller is outside this view */
+ mov -1(%rcx), %r9
+ mov $7, %rsi /* offset the tail dispatcher adds to both pointers */
+ mov %r9, -1(%rdx) /* 8-byte move covering source bytes -1..6 */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit10Case2OrCase3): /* strncpy count ran out in a 16-byte step; the Shl10 caller is outside this view */
+ mov -2(%rcx), %r9
+ mov $6, %rsi /* offset the tail dispatcher adds to both pointers */
+ mov %r9, -2(%rdx) /* 8-byte move covering source bytes -2..5 */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit11Case2OrCase3): /* strncpy count ran out in a 16-byte step of the Shl11 path */
+ mov -3(%rcx), %r9
+ mov $5, %rsi /* offset the tail dispatcher adds to both pointers */
+ mov %r9, -3(%rdx) /* 8-byte move covering source bytes -3..4 */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit12Case2OrCase3): /* strncpy count ran out in a 16-byte step of the Shl12 path */
+ mov (%rcx), %r9d
+ mov $4, %rsi /* offset the tail dispatcher adds to both pointers */
+ mov %r9d, (%rdx) /* the 4 bytes ahead of the tail */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit13Case2OrCase3): /* strncpy count ran out in a 16-byte step of the Shl13 path */
+ mov -1(%rcx), %r9d
+ mov $3, %rsi /* offset the tail dispatcher adds to both pointers */
+ mov %r9d, -1(%rdx) /* 4-byte move covering source bytes -1..2 */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit14Case2OrCase3): /* strncpy count ran out in a 16-byte step of the Shl14 path */
+ mov -2(%rcx), %r9d
+ mov $2, %rsi /* offset the tail dispatcher adds to both pointers */
+ mov %r9d, -2(%rdx) /* 4-byte move covering source bytes -2..1 */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit15Case2OrCase3): /* strncpy count ran out in a 16-byte step of the Shl15 path */
+ mov -3(%rcx), %r9d
+ mov $1, %rsi /* offset the tail dispatcher adds to both pointers */
+ mov %r9d, -3(%rdx) /* 4-byte move covering source bytes -3..0 */
+ test %rax, %rax /* rax = NUL mask computed by the caller */
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave1): /* strncpy exit from the shift-1 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive, as the
+   Shl11..Shl15 loops do for their own Leave labels -- TODO confirm).
+   Expects xmm1 = previous raw vector, xmm2 = first raw vector, xmm4/xmm5 =
+   already-shifted third and fourth chunks.  %rsi accumulates 16 per chunk
+   stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit1)
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit1)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit1)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit1): /* advance past the stored chunks and copy the 15 bytes ahead of the tail */
+ lea 15(%rdx, %rsi), %rdx
+ lea 15(%rcx, %rsi), %rcx
+ mov -15(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -15(%rdx) /* two overlapping 8-byte moves = 15 bytes */
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave2): /* strncpy exit from the shift-2 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive --
+   TODO confirm).  Expects xmm1 = previous raw vector, xmm2 = first raw
+   vector, xmm4/xmm5 = already-shifted third and fourth chunks.  %rsi
+   accumulates 16 per chunk stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit2)
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit2)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit2)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit2): /* advance past the stored chunks and copy the 14 bytes ahead of the tail */
+ lea 14(%rdx, %rsi), %rdx
+ lea 14(%rcx, %rsi), %rcx
+ mov -14(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -14(%rdx) /* two overlapping 8-byte moves = 14 bytes */
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave3): /* strncpy exit from the shift-3 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive --
+   TODO confirm).  Expects xmm1 = previous raw vector, xmm2 = first raw
+   vector, xmm4/xmm5 = already-shifted third and fourth chunks.  %rsi
+   accumulates 16 per chunk stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit3)
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit3)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit3)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit3): /* advance past the stored chunks and copy the 13 bytes ahead of the tail */
+ lea 13(%rdx, %rsi), %rdx
+ lea 13(%rcx, %rsi), %rcx
+ mov -13(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -13(%rdx) /* two overlapping 8-byte moves = 13 bytes */
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave4): /* strncpy exit from the shift-4 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive --
+   TODO confirm).  Expects xmm1 = previous raw vector, xmm2 = first raw
+   vector, xmm4/xmm5 = already-shifted third and fourth chunks.  %rsi
+   accumulates 16 per chunk stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit4)
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit4)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit4)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit4): /* advance past the stored chunks and copy the 12 bytes ahead of the tail */
+ lea 12(%rdx, %rsi), %rdx
+ lea 12(%rcx, %rsi), %rcx
+ mov -12(%rcx), %rsi
+ mov -4(%rcx), %eax
+ mov %rsi, -12(%rdx) /* 8 + 4 bytes */
+ mov %eax, -4(%rdx)
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave5): /* strncpy exit from the shift-5 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive --
+   TODO confirm).  Expects xmm1 = previous raw vector, xmm2 = first raw
+   vector, xmm4/xmm5 = already-shifted third and fourth chunks.  %rsi
+   accumulates 16 per chunk stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit5)
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit5)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit5)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit5): /* advance past the stored chunks and copy the 11 bytes ahead of the tail */
+ lea 11(%rdx, %rsi), %rdx
+ lea 11(%rcx, %rsi), %rcx
+ mov -11(%rcx), %rsi
+ mov -4(%rcx), %eax
+ mov %rsi, -11(%rdx) /* overlapping 8-byte and 4-byte moves = 11 bytes */
+ mov %eax, -4(%rdx)
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave6): /* strncpy exit from the shift-6 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive --
+   TODO confirm).  Expects xmm1 = previous raw vector, xmm2 = first raw
+   vector, xmm4/xmm5 = already-shifted third and fourth chunks.  %rsi
+   accumulates 16 per chunk stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit6)
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit6)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit6)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit6): /* advance past the stored chunks and copy the 10 bytes ahead of the tail */
+ lea 10(%rdx, %rsi), %rdx
+ lea 10(%rcx, %rsi), %rcx
+ mov -10(%rcx), %rsi
+ movw -2(%rcx), %ax
+ mov %rsi, -10(%rdx) /* 8 + 2 bytes */
+ movw %ax, -2(%rdx)
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave7): /* strncpy exit from the shift-7 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive --
+   TODO confirm).  Expects xmm1 = previous raw vector, xmm2 = first raw
+   vector, xmm4/xmm5 = already-shifted third and fourth chunks.  %rsi
+   accumulates 16 per chunk stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit7)
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit7)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit7)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit7): /* advance past the stored chunks and copy the 9 bytes ahead of the tail */
+ lea 9(%rdx, %rsi), %rdx
+ lea 9(%rcx, %rsi), %rcx
+ mov -9(%rcx), %rsi
+ movb -1(%rcx), %ah /* single byte carried in ah */
+ mov %rsi, -9(%rdx) /* 8 + 1 bytes */
+ movb %ah, -1(%rdx)
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave8): /* strncpy exit from the shift-8 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive --
+   TODO confirm).  Expects xmm1 = previous raw vector, xmm2 = first raw
+   vector, xmm4/xmm5 = already-shifted third and fourth chunks.  %rsi
+   accumulates 16 per chunk stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit8)
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit8)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit8)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit8): /* advance past the stored chunks and copy the 8 bytes ahead of the tail */
+ lea 8(%rdx, %rsi), %rdx
+ lea 8(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ mov %rax, -8(%rdx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave9): /* strncpy exit from the shift-9 64-byte loop (caller outside
+   this view; presumably taken when `sub $64, %r8` went non-positive --
+   TODO confirm).  Expects xmm1 = previous raw vector, xmm2 = first raw
+   vector, xmm4/xmm5 = already-shifted third and fourth chunks.  %rsi
+   accumulates 16 per chunk stored; its entry value is set outside this view.  */
+ movaps %xmm2, %xmm3 /* keep the raw first vector; palignr overwrites xmm2 */
+ add $48, %r8 /* with the caller's -64 this makes r8 = remaining - 16 */
+ jle L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2 /* reload the second aligned source vector */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit9)
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, 16(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit9)
+ movaps %xmm4, 32(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit9)
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit9): /* advance past the stored chunks plus the 7 bytes ahead of the tail */
+ lea 7(%rdx, %rsi), %rdx
+ lea 7(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax /* one 8-byte move: those 7 bytes plus the byte before them */
+ xor %rsi, %rsi /* Case3 adds rsi to the pointers; nothing left to add */
+ mov %rax, -8(%rdx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyLeave10): /* Tail of the palignr-by-10 strncpy path: stores up to four more 16-byte chunks at (%rdx) while the count in %r8 allows, adding 16 to %rsi per store, then falls into L(StrncpyExit10).  */
+ movaps %xmm2, %xmm3 /* keep a copy of xmm2; the palignr below overwrites xmm2 */
+ add $48, %r8 /* r8 += 48; NOTE(review): presumably undoes a bias applied by the code that jumps here - confirm */
+ jle L(StrncpyExit10) /* signed result <= 0: store no full chunk */
+ palignr $10, %xmm1, %xmm2 /* xmm2 = bytes 10..25 of xmm2:xmm1 = high 6 bytes of xmm1, then low 10 bytes of xmm2 */
+ movaps %xmm2, (%rdx) /* chunk 0; movaps needs (%rdx) 16-byte aligned */
+ movaps 22(%rcx), %xmm2 /* aligned load, so %rcx + 22 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* rsi += 16; lea leaves the flags untouched */
+ sub $16, %r8 /* r8 -= 16 for the chunk just stored */
+ jbe L(StrncpyExit10) /* borrow or zero: stop here */
+ palignr $10, %xmm3, %xmm2 /* same merge, using the saved xmm3 and the freshly loaded xmm2 */
+ movaps %xmm2, 16(%rdx) /* chunk 1 */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit10) /* borrow or zero: stop here */
+ movaps %xmm4, 32(%rdx) /* chunk 2: xmm4 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit10) /* borrow or zero: stop here */
+ movaps %xmm5, 48(%rdx) /* chunk 3: xmm5 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ lea -16(%r8), %r8 /* r8 -= 16 via lea (no flag update); execution falls into L(StrncpyExit10) */
+
+L(StrncpyExit10): /* Step both pointers 6 bytes past offset %rsi, copy the 8 bytes that end there, zero %rsi, then continue at L(CopyFrom1To16BytesCase3).  */
+ lea 6(%rdx, %rsi), %rdx /* rdx = rdx + rsi + 6 (store pointer) */
+ lea 6(%rcx, %rsi), %rcx /* rcx = rcx + rsi + 6 (load pointer) */
+ mov -8(%rcx), %rax /* 8-byte load ending at the new rcx: the 6 bytes stepped over plus 2 bytes before them */
+ xor %rsi, %rsi /* rsi = 0; its offset is already folded into rcx/rdx */
+ mov %rax, -8(%rdx) /* matching 8-byte store; NOTE(review): the 2 extra leading bytes presumably rewrite bytes already copied - confirm */
+ jmp L(CopyFrom1To16BytesCase3) /* target is defined outside this view */
+
+ .p2align 4
+L(StrncpyLeave11): /* Tail of the palignr-by-11 strncpy path: stores up to four more 16-byte chunks at (%rdx) while the count in %r8 allows, adding 16 to %rsi per store, then falls into L(StrncpyExit11).  */
+ movaps %xmm2, %xmm3 /* keep a copy of xmm2; the palignr below overwrites xmm2 */
+ add $48, %r8 /* r8 += 48; NOTE(review): presumably undoes a bias applied by the code that jumps here - confirm */
+ jle L(StrncpyExit11) /* signed result <= 0: store no full chunk */
+ palignr $11, %xmm1, %xmm2 /* xmm2 = bytes 11..26 of xmm2:xmm1 = high 5 bytes of xmm1, then low 11 bytes of xmm2 */
+ movaps %xmm2, (%rdx) /* chunk 0; movaps needs (%rdx) 16-byte aligned */
+ movaps 21(%rcx), %xmm2 /* aligned load, so %rcx + 21 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* rsi += 16; lea leaves the flags untouched */
+ sub $16, %r8 /* r8 -= 16 for the chunk just stored */
+ jbe L(StrncpyExit11) /* borrow or zero: stop here */
+ palignr $11, %xmm3, %xmm2 /* same merge, using the saved xmm3 and the freshly loaded xmm2 */
+ movaps %xmm2, 16(%rdx) /* chunk 1 */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit11) /* borrow or zero: stop here */
+ movaps %xmm4, 32(%rdx) /* chunk 2: xmm4 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit11) /* borrow or zero: stop here */
+ movaps %xmm5, 48(%rdx) /* chunk 3: xmm5 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ lea -16(%r8), %r8 /* r8 -= 16 via lea (no flag update); execution falls into L(StrncpyExit11) */
+
+L(StrncpyExit11): /* Step both pointers 5 bytes past offset %rsi, copy the 8 bytes that end there, zero %rsi, then continue at L(CopyFrom1To16BytesCase3).  */
+ lea 5(%rdx, %rsi), %rdx /* rdx = rdx + rsi + 5 (store pointer) */
+ lea 5(%rcx, %rsi), %rcx /* rcx = rcx + rsi + 5 (load pointer) */
+ mov -8(%rcx), %rax /* 8-byte load ending at the new rcx: the 5 bytes stepped over plus 3 bytes before them */
+ xor %rsi, %rsi /* rsi = 0; its offset is already folded into rcx/rdx */
+ mov %rax, -8(%rdx) /* matching 8-byte store; NOTE(review): the 3 extra leading bytes presumably rewrite bytes already copied - confirm */
+ jmp L(CopyFrom1To16BytesCase3) /* target is defined outside this view */
+
+ .p2align 4
+L(StrncpyLeave12): /* Tail of the palignr-by-12 strncpy path: stores up to four more 16-byte chunks at (%rdx) while the count in %r8 allows, adding 16 to %rsi per store, then falls into L(StrncpyExit12).  */
+ movaps %xmm2, %xmm3 /* keep a copy of xmm2; the palignr below overwrites xmm2 */
+ add $48, %r8 /* r8 += 48; NOTE(review): presumably undoes a bias applied by the code that jumps here - confirm */
+ jle L(StrncpyExit12) /* signed result <= 0: store no full chunk */
+ palignr $12, %xmm1, %xmm2 /* xmm2 = bytes 12..27 of xmm2:xmm1 = high 4 bytes of xmm1, then low 12 bytes of xmm2 */
+ movaps %xmm2, (%rdx) /* chunk 0; movaps needs (%rdx) 16-byte aligned */
+ movaps 20(%rcx), %xmm2 /* aligned load, so %rcx + 20 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* rsi += 16; lea leaves the flags untouched */
+ sub $16, %r8 /* r8 -= 16 for the chunk just stored */
+ jbe L(StrncpyExit12) /* borrow or zero: stop here */
+ palignr $12, %xmm3, %xmm2 /* same merge, using the saved xmm3 and the freshly loaded xmm2 */
+ movaps %xmm2, 16(%rdx) /* chunk 1 */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit12) /* borrow or zero: stop here */
+ movaps %xmm4, 32(%rdx) /* chunk 2: xmm4 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit12) /* borrow or zero: stop here */
+ movaps %xmm5, 48(%rdx) /* chunk 3: xmm5 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ lea -16(%r8), %r8 /* r8 -= 16 via lea (no flag update); execution falls into L(StrncpyExit12) */
+
+L(StrncpyExit12): /* Copy the 4 bytes at %rcx+%rsi to %rdx+%rsi, step both pointers past them, zero %rsi, then continue at L(CopyFrom1To16BytesCase3).  */
+ lea 4(%rdx, %rsi), %rdx /* rdx = rdx + rsi + 4 (store pointer) */
+ lea 4(%rcx, %rsi), %rcx /* rcx = rcx + rsi + 4 (load pointer) */
+ mov -4(%rcx), %eax /* load the 4 bytes ending at the new rcx (32-bit write also clears the upper half of rax) */
+ xor %rsi, %rsi /* rsi = 0; its offset is already folded into rcx/rdx */
+ mov %eax, -4(%rdx) /* store them ending at the new rdx */
+ jmp L(CopyFrom1To16BytesCase3) /* target is defined outside this view */
+
+ .p2align 4
+L(StrncpyLeave13): /* Tail of the palignr-by-13 strncpy path: stores up to four more 16-byte chunks at (%rdx) while the count in %r8 allows, adding 16 to %rsi per store, then falls into L(StrncpyExit13).  */
+ movaps %xmm2, %xmm3 /* keep a copy of xmm2; the palignr below overwrites xmm2 */
+ add $48, %r8 /* r8 += 48; NOTE(review): presumably undoes a bias applied by the code that jumps here - confirm */
+ jle L(StrncpyExit13) /* signed result <= 0: store no full chunk */
+ palignr $13, %xmm1, %xmm2 /* xmm2 = bytes 13..28 of xmm2:xmm1 = high 3 bytes of xmm1, then low 13 bytes of xmm2 */
+ movaps %xmm2, (%rdx) /* chunk 0; movaps needs (%rdx) 16-byte aligned */
+ movaps 19(%rcx), %xmm2 /* aligned load, so %rcx + 19 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* rsi += 16; lea leaves the flags untouched */
+ sub $16, %r8 /* r8 -= 16 for the chunk just stored */
+ jbe L(StrncpyExit13) /* borrow or zero: stop here */
+ palignr $13, %xmm3, %xmm2 /* same merge, using the saved xmm3 and the freshly loaded xmm2 */
+ movaps %xmm2, 16(%rdx) /* chunk 1 */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit13) /* borrow or zero: stop here */
+ movaps %xmm4, 32(%rdx) /* chunk 2: xmm4 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit13) /* borrow or zero: stop here */
+ movaps %xmm5, 48(%rdx) /* chunk 3: xmm5 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ lea -16(%r8), %r8 /* r8 -= 16 via lea (no flag update); execution falls into L(StrncpyExit13) */
+
+L(StrncpyExit13): /* Step both pointers 3 bytes past offset %rsi, copy the 4 bytes that end there, zero %rsi, then continue at L(CopyFrom1To16BytesCase3).  */
+ lea 3(%rdx, %rsi), %rdx /* rdx = rdx + rsi + 3 (store pointer) */
+ lea 3(%rcx, %rsi), %rcx /* rcx = rcx + rsi + 3 (load pointer) */
+ mov -4(%rcx), %eax /* 4-byte load ending at the new rcx: the 3 bytes stepped over plus 1 byte before them */
+ xor %rsi, %rsi /* rsi = 0; its offset is already folded into rcx/rdx */
+ mov %eax, -4(%rdx) /* matching 4-byte store; NOTE(review): the extra leading byte presumably rewrites a byte already copied - confirm */
+ jmp L(CopyFrom1To16BytesCase3) /* target is defined outside this view */
+
+ .p2align 4
+L(StrncpyLeave14): /* Tail of the palignr-by-14 strncpy path: stores up to four more 16-byte chunks at (%rdx) while the count in %r8 allows, adding 16 to %rsi per store, then falls into L(StrncpyExit14).  */
+ movaps %xmm2, %xmm3 /* keep a copy of xmm2; the palignr below overwrites xmm2 */
+ add $48, %r8 /* r8 += 48; NOTE(review): presumably undoes a bias applied by the code that jumps here - confirm */
+ jle L(StrncpyExit14) /* signed result <= 0: store no full chunk */
+ palignr $14, %xmm1, %xmm2 /* xmm2 = bytes 14..29 of xmm2:xmm1 = high 2 bytes of xmm1, then low 14 bytes of xmm2 */
+ movaps %xmm2, (%rdx) /* chunk 0; movaps needs (%rdx) 16-byte aligned */
+ movaps 18(%rcx), %xmm2 /* aligned load, so %rcx + 18 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* rsi += 16; lea leaves the flags untouched */
+ sub $16, %r8 /* r8 -= 16 for the chunk just stored */
+ jbe L(StrncpyExit14) /* borrow or zero: stop here */
+ palignr $14, %xmm3, %xmm2 /* same merge, using the saved xmm3 and the freshly loaded xmm2 */
+ movaps %xmm2, 16(%rdx) /* chunk 1 */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit14) /* borrow or zero: stop here */
+ movaps %xmm4, 32(%rdx) /* chunk 2: xmm4 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit14) /* borrow or zero: stop here */
+ movaps %xmm5, 48(%rdx) /* chunk 3: xmm5 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ lea -16(%r8), %r8 /* r8 -= 16 via lea (no flag update); execution falls into L(StrncpyExit14) */
+
+L(StrncpyExit14): /* Copy the 2 bytes at %rcx+%rsi to %rdx+%rsi, step both pointers past them, zero %rsi, then continue at L(CopyFrom1To16BytesCase3).  */
+ lea 2(%rdx, %rsi), %rdx /* rdx = rdx + rsi + 2 (store pointer) */
+ lea 2(%rcx, %rsi), %rcx /* rcx = rcx + rsi + 2 (load pointer) */
+ movw -2(%rcx), %ax /* load the 2 bytes ending at the new rcx; only the low 16 bits of rax change */
+ xor %rsi, %rsi /* rsi = 0; its offset is already folded into rcx/rdx */
+ movw %ax, -2(%rdx) /* store them ending at the new rdx */
+ jmp L(CopyFrom1To16BytesCase3) /* target is defined outside this view */
+
+ .p2align 4
+L(StrncpyLeave15): /* Tail of the palignr-by-15 strncpy path: stores up to four more 16-byte chunks at (%rdx) while the count in %r8 allows, adding 16 to %rsi per store, then falls into L(StrncpyExit15).  */
+ movaps %xmm2, %xmm3 /* keep a copy of xmm2; the palignr below overwrites xmm2 */
+ add $48, %r8 /* r8 += 48; NOTE(review): presumably undoes a bias applied by the code that jumps here - confirm */
+ jle L(StrncpyExit15) /* signed result <= 0: store no full chunk */
+ palignr $15, %xmm1, %xmm2 /* xmm2 = bytes 15..30 of xmm2:xmm1 = top byte of xmm1, then low 15 bytes of xmm2 */
+ movaps %xmm2, (%rdx) /* chunk 0; movaps needs (%rdx) 16-byte aligned */
+ movaps 17(%rcx), %xmm2 /* aligned load, so %rcx + 17 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* rsi += 16; lea leaves the flags untouched */
+ sub $16, %r8 /* r8 -= 16 for the chunk just stored */
+ jbe L(StrncpyExit15) /* borrow or zero: stop here */
+ palignr $15, %xmm3, %xmm2 /* same merge, using the saved xmm3 and the freshly loaded xmm2 */
+ movaps %xmm2, 16(%rdx) /* chunk 1 */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit15) /* borrow or zero: stop here */
+ movaps %xmm4, 32(%rdx) /* chunk 2: xmm4 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ sub $16, %r8 /* r8 -= 16 */
+ jbe L(StrncpyExit15) /* borrow or zero: stop here */
+ movaps %xmm5, 48(%rdx) /* chunk 3: xmm5 is stored as-is; it is set up outside this view */
+ lea 16(%rsi), %rsi /* rsi += 16 */
+ lea -16(%r8), %r8 /* r8 -= 16 via lea (no flag update); execution falls into L(StrncpyExit15) */
+
+L(StrncpyExit15): /* Copy the single byte at %rcx+%rsi to %rdx+%rsi, step both pointers past it, zero %rsi, then continue at L(CopyFrom1To16BytesCase3).  */
+ lea 1(%rdx, %rsi), %rdx /* rdx = rdx + rsi + 1 (store pointer) */
+ lea 1(%rcx, %rsi), %rcx /* rcx = rcx + rsi + 1 (load pointer) */
+ movb -1(%rcx), %ah /* load the byte just below the new rcx; only bits 8..15 of rax are written */
+ xor %rsi, %rsi /* rsi = 0; its offset is already folded into rcx/rdx */
+ movb %ah, -1(%rdx) /* store it just below the new rdx */
+ jmp L(CopyFrom1To16BytesCase3) /* target is defined outside this view */
+
+# endif
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# endif
+#endif