aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2011-06-24 14:15:32 -0400
committerUlrich Drepper <drepper@gmail.com>2011-06-24 14:15:32 -0400
commit0b1cbaaef5ccc21baf2c35d4698fb28e82eab385 (patch)
treec1f6ad8a49ef79510355c765ad3e385067e7ade0 /sysdeps/i386/i686/multiarch/strcpy-ssse3.S
parent07f494a027b3adea1f3cd0cd4ca7c10949cdc476 (diff)
downloadglibc-0b1cbaaef5ccc21baf2c35d4698fb28e82eab385.tar
glibc-0b1cbaaef5ccc21baf2c35d4698fb28e82eab385.tar.gz
glibc-0b1cbaaef5ccc21baf2c35d4698fb28e82eab385.tar.bz2
glibc-0b1cbaaef5ccc21baf2c35d4698fb28e82eab385.zip
Optimized st{r,p}{,n}cpy for SSE2/SSSE3 on x86-32
Diffstat (limited to 'sysdeps/i386/i686/multiarch/strcpy-ssse3.S')
-rw-r--r--sysdeps/i386/i686/multiarch/strcpy-ssse3.S4090
1 files changed, 4090 insertions, 0 deletions
diff --git a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..577d11789c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -0,0 +1,4090 @@
+/* strcpy with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY __strcpy_ssse3
+# endif
+
+# ifdef USE_AS_STRNCPY
+# define PARMS 8
+# define ENTRANCE PUSH(%ebx)
+# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx);
+# define RETURN1 POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi)
+# else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN ret
+# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi)
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+/* In this code following instructions are used for copying:
+ movb - 1 byte
+ movw - 2 byte
+ movl - 4 byte
+ movlpd - 8 byte
+ movaps - 16 byte - requires 16 byte alignment
+ of sourse and destination adresses.
+ 16 byte alignment: adress is 32bit value,
+ right four bit of adress shall be 0.
+*/
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+# ifdef USE_AS_STRNCPY
+ movl LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(ExitTail0)
+ cmp $8, %ebx
+ jbe L(StrncpyExit8Bytes)
+# endif
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ jb L(StrncpyExit15Bytes)
+# endif
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ je L(ExitTail16)
+# endif
+ cmpb $0, 15(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ mov %edx, %edi
+ PUSH (%esi)
+# ifdef USE_AS_STRNCPY
+ mov %ecx, %esi
+ and $0xf, %esi
+
+/* add 16 bytes ecx_shift to ebx */
+
+ add %esi, %ebx
+# endif
+ lea 16(%ecx), %esi
+/* Now:
+ esi = alignment_16(ecx) + ecx_shift + 16;
+ ecx_shift = ecx - alignment_16(ecx)
+*/
+ and $-16, %esi
+/* Now:
+ esi = alignment_16(ecx) + 16
+*/
+ pxor %xmm0, %xmm0
+ movlpd (%ecx), %xmm1
+ movlpd %xmm1, (%edx)
+/*
+ look if there is zero symbol in next 16 bytes of string
+ from esi to esi + 15 and form mask in xmm0
+*/
+ pcmpeqb (%esi), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+
+/* convert byte mask in xmm0 to bit mask */
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+/* esi = 16 - ecx_shift */
+
+/* eax = 0: there isn't end of string from position esi to esi+15 */
+
+# ifdef USE_AS_STRNCPY
+ sub $32, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+/* Now:
+ edx = edx + 16 = alignment_16(edx) + edx_shift + 16
+*/
+ and $-16, %edx
+
+/* Now: edx = alignment_16(edx) + 16 */
+
+ sub %edx, %eax
+
+/* Now: eax = edx_shift - 16 */
+
+# ifdef USE_AS_STRNCPY
+ add %eax, %esi
+ lea -1(%esi), %esi
+ and $1<<31, %esi
+ test %esi, %esi
+ jnz L(ContinueCopy)
+ lea 16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+ sub %eax, %ecx
+/* Now:
+ case ecx_shift >= edx_shift:
+ ecx = alignment_16(ecx) + (ecx_shift - edx_shift) + 16
+ case ecx_shift < edx_shift:
+ ecx = alignment_16(ecx) + (16 + ecx_shift - edx_shift)
+*/
+ mov %ecx, %eax
+ and $0xf, %eax
+/* Now:
+ case ecx_shift >= edx_shift: eax = ecx_shift - edx_shift
+ case ecx_shift < edx_shift: eax = (16 + ecx_shift - edx_shift)
+ eax can be 0, 1, ..., 15
+*/
+ mov $0, %esi
+
+/* case: ecx_shift == edx_shift */
+
+ jz L(Align16Both)
+
+ cmp $8, %eax
+ jae L(ShlHigh8)
+ cmp $1, %eax
+ je L(Shl1)
+ cmp $2, %eax
+ je L(Shl2)
+ cmp $3, %eax
+ je L(Shl3)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $5, %eax
+ je L(Shl5)
+ cmp $6, %eax
+ je L(Shl6)
+ jmp L(Shl7)
+
+L(ShlHigh8):
+ je L(Shl8)
+ cmp $9, %eax
+ je L(Shl9)
+ cmp $10, %eax
+ je L(Shl10)
+ cmp $11, %eax
+ je L(Shl11)
+ cmp $12, %eax
+ je L(Shl12)
+ cmp $13, %eax
+ je L(Shl13)
+ cmp $14, %eax
+ je L(Shl14)
+ jmp L(Shl15)
+
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ lea 48+64(%ebx, %eax), %ebx
+# endif
+ mov $-0x40, %esi
+
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqb %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeaveCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+ lea 48(%ebx), %ebx
+# endif
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqb %xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl1):
+ movaps -1(%ecx), %xmm1
+ movaps 15(%ecx), %xmm2
+L(Shl1Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 31(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -15(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+ movaps 15(%ecx), %xmm2
+ movaps 31(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 47(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 63(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $1, %xmm3, %xmm4
+ jnz L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave1)
+# endif
+ palignr $1, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $15, %xmm6
+ mov $15, %esi
+ palignr $1, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl2):
+ movaps -2(%ecx), %xmm1
+ movaps 14(%ecx), %xmm2
+L(Shl2Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 30(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -14(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+ movaps 14(%ecx), %xmm2
+ movaps 30(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 46(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 62(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $2, %xmm3, %xmm4
+ jnz L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave2)
+# endif
+ palignr $2, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $14, %xmm6
+ mov $14, %esi
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl3):
+ movaps -3(%ecx), %xmm1
+ movaps 13(%ecx), %xmm2
+L(Shl3Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 29(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -13(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+ movaps 13(%ecx), %xmm2
+ movaps 29(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 45(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 61(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $3, %xmm3, %xmm4
+ jnz L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave3)
+# endif
+ palignr $3, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $13, %xmm6
+ mov $13, %esi
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave4)
+# endif
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $12, %xmm6
+ mov $12, %esi
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl5):
+ movaps -5(%ecx), %xmm1
+ movaps 11(%ecx), %xmm2
+L(Shl5Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 27(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -11(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+ movaps 11(%ecx), %xmm2
+ movaps 27(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 43(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 59(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $5, %xmm3, %xmm4
+ jnz L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave5)
+# endif
+ palignr $5, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $11, %xmm6
+ mov $11, %esi
+ palignr $5, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl6):
+ movaps -6(%ecx), %xmm1
+ movaps 10(%ecx), %xmm2
+L(Shl6Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 26(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -10(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+ movaps 10(%ecx), %xmm2
+ movaps 26(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 42(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 58(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $6, %xmm3, %xmm4
+ jnz L(Shl6Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave6)
+# endif
+ palignr $6, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $10, %xmm6
+ mov $10, %esi
+ palignr $6, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl7):
+ movaps -7(%ecx), %xmm1
+ movaps 9(%ecx), %xmm2
+L(Shl7Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 25(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -9(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+ movaps 9(%ecx), %xmm2
+ movaps 25(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 41(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 57(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $7, %xmm3, %xmm4
+ jnz L(Shl7Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave7)
+# endif
+ palignr $7, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $9, %xmm6
+ mov $9, %esi
+ palignr $7, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl8):
+ movaps -8(%ecx), %xmm1
+ movaps 8(%ecx), %xmm2
+L(Shl8Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -8(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave8)
+# endif
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $8, %xmm6
+ mov $8, %esi
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl9):
+ movaps -9(%ecx), %xmm1
+ movaps 7(%ecx), %xmm2
+L(Shl9Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 23(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -7(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+ movaps 7(%ecx), %xmm2
+ movaps 23(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 39(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 55(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $9, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $9, %xmm3, %xmm4
+ jnz L(Shl9Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave9)
+# endif
+ palignr $9, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $7, %xmm6
+ mov $7, %esi
+ palignr $9, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl10):
+ movaps -10(%ecx), %xmm1
+ movaps 6(%ecx), %xmm2
+L(Shl10Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 22(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -6(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+ movaps 6(%ecx), %xmm2
+ movaps 22(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 38(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 54(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $10, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $10, %xmm3, %xmm4
+ jnz L(Shl10Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave10)
+# endif
+ palignr $10, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $6, %xmm6
+ mov $6, %esi
+ palignr $10, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl11):
+ movaps -11(%ecx), %xmm1
+ movaps 5(%ecx), %xmm2
+L(Shl11Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 21(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -5(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -11(%ecx), %xmm1
+
+L(Shl11LoopStart):
+ movaps 5(%ecx), %xmm2
+ movaps 21(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 37(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 53(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $11, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $11, %xmm3, %xmm4
+ jnz L(Shl11Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave11)
+# endif
+ palignr $11, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $5, %xmm6
+ mov $5, %esi
+ palignr $11, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12):
+ movaps -12(%ecx), %xmm1
+ movaps 4(%ecx), %xmm2
+L(Shl12Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -4(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $12, %xmm3, %xmm4
+ jnz L(Shl12Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave12)
+# endif
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $4, %xmm6
+ mov $4, %esi
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl13):
+ movaps -13(%ecx), %xmm1
+ movaps 3(%ecx), %xmm2
+L(Shl13Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 19(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -3(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -13(%ecx), %xmm1
+
+L(Shl13LoopStart):
+ movaps 3(%ecx), %xmm2
+ movaps 19(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 35(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 51(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $13, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $13, %xmm3, %xmm4
+ jnz L(Shl13Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave13)
+# endif
+ palignr $13, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl13LoopStart)
+
+L(Shl13LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $3, %xmm6
+ mov $3, %esi
+ palignr $13, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl14):
+ movaps -14(%ecx), %xmm1
+ movaps 2(%ecx), %xmm2
+L(Shl14Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 18(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -2(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -14(%ecx), %xmm1
+
+L(Shl14LoopStart):
+ movaps 2(%ecx), %xmm2
+ movaps 18(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 34(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 50(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $14, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $14, %xmm3, %xmm4
+ jnz L(Shl14Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave14)
+# endif
+ palignr $14, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl14LoopStart)
+
+L(Shl14LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $2, %xmm6
+ mov $2, %esi
+ palignr $14, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl15):
+ movaps -15(%ecx), %xmm1
+ movaps 1(%ecx), %xmm2
+L(Shl15Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 17(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -1(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -15(%ecx), %xmm1
+
+L(Shl15LoopStart):
+ movaps 1(%ecx), %xmm2
+ movaps 17(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 33(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 49(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $15, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $15, %xmm3, %xmm4
+ jnz L(Shl15Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave15)
+# endif
+ palignr $15, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl15LoopStart)
+
+L(Shl15LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $1, %xmm6
+ mov $1, %esi
+ palignr $15, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+# ifdef USE_AS_STRNCPY
+ add $16, %ebx
+# endif
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+
+ .p2align 4
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+
+ .p2align 4
+L(Exit16):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ lea (%esi, %edx), %esi
+ lea -9(%ebx), %edx
+ and $1<<7, %dh
+ or %al, %dh
+ test %dh, %dh
+ lea (%esi), %edx
+ POP (%esi)
+ jz L(ExitHighCase2)
+
+ cmp $1, %ebx
+ je L(Exit1)
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $2, %ebx
+ je L(Exit2)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $3, %ebx
+ je L(Exit3)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $4, %ebx
+ je L(Exit4)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $5, %ebx
+ je L(Exit5)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $6, %ebx
+ je L(Exit6)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $7, %ebx
+ je L(Exit7)
+ test $0x40, %al
+ jnz L(Exit7)
+ jmp L(Exit8)
+
+ .p2align 4
+L(ExitHighCase2):
+ cmp $9, %ebx
+ je L(Exit9)
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $10, %ebx
+ je L(Exit10)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $11, %ebx
+ je L(Exit11)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $12, %ebx
+ je L(Exit12)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $13, %ebx
+ je L(Exit13)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $14, %ebx
+ je L(Exit14)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $15, %ebx
+ je L(Exit15)
+ test $0x40, %ah
+ jnz L(Exit15)
+ jmp L(Exit16)
+
+ CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ cmp $16, %ebx
+ je L(Exit16)
+ cmp $8, %ebx
+ je L(Exit8)
+ jg L(More8Case3)
+ cmp $4, %ebx
+ je L(Exit4)
+ jg L(More4Case3)
+ cmp $2, %ebx
+ jl L(Exit1)
+ je L(Exit2)
+ jg L(Exit3)
+L(More8Case3): /* but less than 16 */
+ cmp $12, %ebx
+ je L(Exit12)
+ jl L(Less12Case3)
+ cmp $14, %ebx
+ jl L(Exit13)
+ je L(Exit14)
+ jg L(Exit15)
+L(More4Case3): /* but less than 8 */
+ cmp $6, %ebx
+ jl L(Exit5)
+ je L(Exit6)
+ jg L(Exit7)
+L(Less12Case3): /* but more than 8 */
+ cmp $10, %ebx
+ jl L(Exit9)
+ je L(Exit10)
+ jg L(Exit11)
+# endif
+
+ .p2align 4
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+# ifdef USE_AS_STPCPY
+ lea (%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+CFI_POP (%edi)
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movw %dx, (%ecx)
+ movb %dl, 2(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%ecx)
+ movb %dl, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%ecx)
+ movw %dx, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movl %edx, (%ecx)
+ movl %edx, 3(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%ecx)
+ movb %dl, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%ecx)
+ movw %dx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 7(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 5(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 6(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 7(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(StrncpyFillExit1):
+ lea 16(%ebx), %ebx
+L(FillFrom1To16Bytes):
+ test %ebx, %ebx
+ jz L(Fill0)
+ cmp $16, %ebx
+ je L(Fill16)
+ cmp $8, %ebx
+ je L(Fill8)
+ jg L(FillMore8)
+ cmp $4, %ebx
+ je L(Fill4)
+ jg L(FillMore4)
+ cmp $2, %ebx
+ jl L(Fill1)
+ je L(Fill2)
+ jg L(Fill3)
+L(FillMore8): /* but less than 16 */
+ cmp $12, %ebx
+ je L(Fill12)
+ jl L(FillLess12)
+ cmp $14, %ebx
+ jl L(Fill13)
+ je L(Fill14)
+ jg L(Fill15)
+L(FillMore4): /* but less than 8 */
+ cmp $6, %ebx
+ jl L(Fill5)
+ je L(Fill6)
+ jg L(Fill7)
+L(FillLess12): /* but more than 8 */
+ cmp $10, %ebx
+ jl L(Fill9)
+ je L(Fill10)
+ jmp L(Fill11)
+
+ CFI_PUSH(%edi)
+
+ .p2align 4
+L(StrncpyFillTailWithZero1):
+ POP (%edi)
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %edx, %edx
+ sub $16, %ebx
+ jbe L(StrncpyFillExit1)
+
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+
+ lea 16(%ecx), %ecx
+
+ mov %ecx, %edx
+ and $0xf, %edx
+ sub %edx, %ecx
+ add %edx, %ebx
+ xor %edx, %edx
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ movdqa %xmm0, 32(%ecx)
+ movdqa %xmm0, 48(%ecx)
+ lea 64(%ecx), %ecx
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %ebx
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ lea 32(%ecx), %ecx
+ sub $16, %ebx
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+ add $16, %ebx
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+# endif
+
+ .p2align 4
+L(ExitTail1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+# ifdef USE_AS_STPCPY
+ lea (%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail16):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+# ifdef USE_AS_STRNCPY
+L(StrncpyLeaveCase2OrCase3):
+ test %eax, %eax
+ jnz L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase3)
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase2)
+/* -------------------------------------------------- */
+L(StrncpyExit1Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $15, %xmm6
+ mov $15, %esi
+ palignr $1, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit2Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $14, %xmm6
+ mov $14, %esi
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit3Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $13, %xmm6
+ mov $13, %esi
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit4Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $12, %xmm6
+ mov $12, %esi
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit5Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $11, %xmm6
+ mov $11, %esi
+ palignr $5, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit6Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $10, %xmm6
+ mov $10, %esi
+ palignr $6, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit7Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $9, %xmm6
+ mov $9, %esi
+ palignr $7, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit8Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $8, %xmm6
+ mov $8, %esi
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit9Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $7, %xmm6
+ mov $7, %esi
+ palignr $9, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit10Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $6, %xmm6
+ mov $6, %esi
+ palignr $10, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit11Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $5, %xmm6
+ mov $5, %esi
+ palignr $11, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit12Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $4, %xmm6
+ mov $4, %esi
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit13Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $3, %xmm6
+ mov $3, %esi
+ palignr $13, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit14Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $2, %xmm6
+ mov $2, %esi
+ palignr $14, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit15Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $1, %xmm6
+ mov $1, %esi
+ palignr $15, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 31+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit1):
+ movaps (%edx, %esi), %xmm6
+ psrldq $15, %xmm6
+ palignr $1, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 15(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 30+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit2):
+ movaps (%edx, %esi), %xmm6
+ psrldq $14, %xmm6
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 14(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 29+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit3):
+ movaps (%edx, %esi), %xmm6
+ psrldq $13, %xmm6
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 13(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 28+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit4):
+ movaps (%edx, %esi), %xmm6
+ psrldq $12, %xmm6
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 12(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 27+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit5):
+ movaps (%edx, %esi), %xmm6
+ psrldq $11, %xmm6
+ palignr $5, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 11(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 26+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit6):
+ movaps (%edx, %esi), %xmm6
+ psrldq $10, %xmm6
+ palignr $6, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 10(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 25+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit7):
+ movaps (%edx, %esi), %xmm6
+ psrldq $9, %xmm6
+ palignr $7, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 9(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 24+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit8):
+ movaps (%edx, %esi), %xmm6
+ psrldq $8, %xmm6
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 8(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 23+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit9):
+ movaps (%edx, %esi), %xmm6
+ psrldq $7, %xmm6
+ palignr $9, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 7(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit10)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 22+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit10):
+ movaps (%edx, %esi), %xmm6
+ psrldq $6, %xmm6
+ palignr $10, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 6(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit11)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 21+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit11):
+ movaps (%edx, %esi), %xmm6
+ psrldq $5, %xmm6
+ palignr $11, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 5(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit12)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 20+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit12):
+ movaps (%edx, %esi), %xmm6
+ psrldq $4, %xmm6
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 4(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit13)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 19+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit13):
+ movaps (%edx, %esi), %xmm6
+ psrldq $3, %xmm6
+ palignr $13, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 3(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit14)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 18+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit14):
+ movaps (%edx, %esi), %xmm6
+ psrldq $2, %xmm6
+ palignr $14, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 2(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit15)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 17+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit15):
+ movaps (%edx, %esi), %xmm6
+ psrldq $1, %xmm6
+ palignr $15, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 1(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(ExitTail0):
+ movl %edx, %eax
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15Bytes):
+ cmp $9, %ebx
+ je L(ExitTail9)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmp $10, %ebx
+ je L(ExitTail10)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmp $11, %ebx
+ je L(ExitTail11)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmp $12, %ebx
+ je L(ExitTail12)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmp $13, %ebx
+ je L(ExitTail13)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmp $14, %ebx
+ je L(ExitTail14)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+# endif
+
+ .p2align 4
+L(ExitTail15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(StrncpyExit8Bytes):
+ cmp $1, %ebx
+ je L(ExitTail1)
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmp $2, %ebx
+ je L(ExitTail2)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmp $3, %ebx
+ je L(ExitTail3)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmp $4, %ebx
+ je L(ExitTail4)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmp $5, %ebx
+ je L(ExitTail5)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmp $6, %ebx
+ je L(ExitTail6)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmp $7, %ebx
+ je L(ExitTail7)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+# endif
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+END (STRCPY)
+
+#endif