aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/x86_64
diff options
context:
space:
mode:
authorLiubov Dmitrieva <liubov.dmitrieva@gmail.com>2011-10-15 11:10:08 -0400
committerUlrich Drepper <drepper@gmail.com>2011-10-15 11:10:08 -0400
commitbe13f7bff66e1850f9057dd813d6e7be022d9516 (patch)
treed918a146db9072ad120f0010481c53d9b450c9a5 /sysdeps/x86_64
parent556a2007974ed39a68c87a8b5181f8057ecd0d6f (diff)
downloadglibc-be13f7bff66e1850f9057dd813d6e7be022d9516.tar
glibc-be13f7bff66e1850f9057dd813d6e7be022d9516.tar.gz
glibc-be13f7bff66e1850f9057dd813d6e7be022d9516.tar.bz2
glibc-be13f7bff66e1850f9057dd813d6e7be022d9516.zip
Optimized memcmp and wmemcmp for x86-64 and x86-32
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r--sysdeps/x86_64/multiarch/Makefile3
-rw-r--r--sysdeps/x86_64/multiarch/memcmp-sse4.S192
-rw-r--r--sysdeps/x86_64/multiarch/memcmp-ssse3.S1997
-rw-r--r--sysdeps/x86_64/multiarch/memcmp.S19
-rw-r--r--sysdeps/x86_64/multiarch/wmemcmp-c.c5
-rw-r--r--sysdeps/x86_64/multiarch/wmemcmp-sse4.S4
-rw-r--r--sysdeps/x86_64/multiarch/wmemcmp-ssse3.S4
-rw-r--r--sysdeps/x86_64/multiarch/wmemcmp.S47
8 files changed, 2242 insertions, 29 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a5254dc93c..e0bb9847a8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
- strrchr-sse2-no-bsf strchr-sse2-no-bsf
+ strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+ memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index fc439bb013..28dd505d99 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.1
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.1, wmemcmp with SSE4.1
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,43 +20,54 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_1
+# endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
-#define JMPTBL(I, B) (I - B)
+# define JMPTBL(I, B) (I - B)
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
lea TABLE(%rip), %r11; \
movslq (%r11, INDEX, SCALE), %rcx; \
add %r11, %rcx; \
jmp *%rcx; \
ud2
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
+
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+# endif
pxor %xmm0, %xmm0
cmp $79, %rdx
ja L(79bytesormore)
+# ifndef USE_AS_WMEMCMP
cmp $1, %rdx
je L(firstbyte)
+# endif
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+# ifndef USE_AS_WMEMCMP
ALIGN (4)
L(firstbyte):
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
sub %ecx, %eax
ret
+# endif
ALIGN (4)
L(79bytesormore):
@@ -308,11 +319,11 @@ L(less32bytesin256):
ALIGN (4)
L(512bytesormore):
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8
-#else
+# else
mov __x86_64_data_cache_size_half(%rip), %r8
-#endif
+# endif
mov %r8, %r9
shr $1, %r8
add %r9, %r8
@@ -624,11 +635,11 @@ L(less32bytesin256in2alinged):
ALIGN (4)
L(512bytesormorein2aligned):
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8
-#else
+# else
mov __x86_64_data_cache_size_half(%rip), %r8
-#endif
+# endif
mov %r8, %r9
shr $1, %r8
add %r9, %r8
@@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned):
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(L2_L3_cache_aglined):
sub $64, %rdx
+
ALIGN (4)
L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
@@ -803,13 +815,19 @@ L(12bytes):
jne L(diffin8bytes)
L(4bytes):
mov -4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
L(0bytes):
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal case for wmemcmp */
ALIGN (4)
L(65bytes):
movdqu -65(%rdi), %xmm1
@@ -1017,6 +1035,7 @@ L(1bytes):
movzbl -1(%rsi), %ecx
sub %ecx, %eax
ret
+# endif
ALIGN (4)
L(68bytes):
@@ -1047,13 +1066,20 @@ L(20bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
- mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(69bytes):
movdqu -69(%rsi), %xmm1
@@ -1161,6 +1187,7 @@ L(23bytes):
jne L(diffin8bytes)
xor %eax, %eax
ret
+# endif
ALIGN (4)
L(72bytes):
@@ -1191,13 +1218,16 @@ L(24bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
- mov -8(%rdi), %rax
+
mov -8(%rsi), %rcx
+ mov -8(%rdi), %rax
cmp %rax, %rcx
jne L(diffin8bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(73bytes):
movdqu -73(%rsi), %xmm1
@@ -1312,7 +1342,7 @@ L(27bytes):
jne L(diffin4bytes)
xor %eax, %eax
ret
-
+# endif
ALIGN (4)
L(76bytes):
movdqu -76(%rsi), %xmm1
@@ -1346,13 +1376,19 @@ L(28bytes):
mov -12(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
- mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(77bytes):
movdqu -77(%rsi), %xmm1
@@ -1474,7 +1510,7 @@ L(31bytes):
jne L(diffin8bytes)
xor %eax, %eax
ret
-
+# endif
ALIGN (4)
L(64bytes):
movdqu -64(%rdi), %xmm2
@@ -1527,7 +1563,17 @@ L(diffin8bytes):
jne L(diffin4bytes)
shr $32, %rcx
shr $32, %rax
+
+# ifdef USE_AS_WMEMCMP
+/* for wmemcmp */
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+# endif
+
L(diffin4bytes):
+# ifndef USE_AS_WMEMCMP
cmp %cx, %ax
jne L(diffin2bytes)
shr $16, %ecx
@@ -1546,11 +1592,28 @@ L(end):
and $0xff, %ecx
sub %ecx, %eax
ret
+# else
+
+/* for wmemcmp */
+ mov $1, %eax
+ jl L(nequal_bigger)
+ neg %eax
+ ret
+
+ ALIGN (4)
+L(nequal_bigger):
+ ret
+
+L(unreal_case):
+ xor %eax, %eax
+ ret
+# endif
END (MEMCMP)
.section .rodata.sse4.1,"a",@progbits
ALIGN (3)
+# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes))
@@ -1632,4 +1695,87 @@ L(table_64bytes):
.int JMPTBL (L(77bytes), L(table_64bytes))
.int JMPTBL (L(78bytes), L(table_64bytes))
.int JMPTBL (L(79bytes), L(table_64bytes))
+# else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+# endif
#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..b3a2ca1edd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1997 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ atom_text_section
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+ test %rdx, %rdx
+ jz L(equal)
+# endif
+ mov %rdx, %rcx
+ mov %rdi, %rdx
+ cmp $48, %rcx;
+ jae L(48bytesormore) /* LEN => 48 */
+
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+/* RCX >= 48.  */
+L(48bytesormore):
+ movdqu (%rdi), %xmm3
+ movdqu (%rsi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%rdi), %rdi
+ lea 16(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %rdx, %rdi
+ sub %rdx, %rsi
+ add %rdx, %rcx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %rdx, %rsi
+
+# ifndef USE_AS_WMEMCMP
+ cmp $8, %edx
+ jae L(next_unaligned_table)
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $1, %edx
+ je L(shr_1)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $3, %edx
+ je L(shr_3)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $5, %edx
+ je L(shr_5)
+ cmp $6, %edx
+ je L(shr_6)
+ jmp L(shr_7)
+
+ ALIGN (2)
+L(next_unaligned_table):
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $9, %edx
+ je L(shr_9)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $11, %edx
+ je L(shr_11)
+ cmp $12, %edx
+ je L(shr_12)
+ cmp $13, %edx
+ je L(shr_13)
+ cmp $14, %edx
+ je L(shr_14)
+ jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
+
+ ALIGN (4)
+L(shr_0):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ jae L(shr_0_gobble)
+ xor %eax, %eax
+ movdqa (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+ movdqa 16(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_0_gobble):
+ movdqa (%rsi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%rdi), %xmm0
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm2
+L(shr_0_gobble_loop):
+ pand %xmm0, %xmm2
+ sub $32, %rcx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%rsi), %xmm0
+ movdqa 48(%rsi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%rdi), %xmm0
+ pcmpeqb 48(%rdi), %xmm2
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ jz L(shr_0_gobble_loop)
+
+ pand %xmm0, %xmm2
+ cmp $0, %rcx
+ jge L(next)
+ inc %edx
+ add $32, %rcx
+L(next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_1):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_1_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $1, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $1, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $1, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_1_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $1, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $1, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_1_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $1, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $1, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_1_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_1_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_1_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 1(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+
+ ALIGN (4)
+L(shr_2):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $2, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $2, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_2_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $2, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $2, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_2_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $2, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $2, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_2_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 2(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_3):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_3_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $3, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $3, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $3, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_3_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $3, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $3, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_3_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $3, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $3, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_3_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_3_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_3_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 3(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_4):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $4, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $4, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_4_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $4, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $4, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_4_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $4, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $4, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_4_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 4(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_5):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_5_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $5, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $5, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $5, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_5_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $5, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $5, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_5_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $5, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $5, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_5_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_5_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_5_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 5(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_6):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $6, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $6, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_6_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $6, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $6, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_6_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $6, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $6, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_6_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 6(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_7):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_7_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $7, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $7, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $7, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_7_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $7, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $7, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_7_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $7, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $7, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_7_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_7_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_7_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 7(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_8):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $8, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $8, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_8_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $8, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $8, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_8_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $8, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $8, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_8_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 8(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_9):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_9_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $9, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $9, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $9, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_9_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $9, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $9, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_9_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $9, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $9, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_9_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_9_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_9_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 9(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_10):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $10, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $10, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_10_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $10, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $10, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_10_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $10, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $10, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_10_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 10(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_11):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_11_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $11, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $11, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $11, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_11_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $11, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $11, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_11_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $11, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $11, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_11_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_11_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_11_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 11(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_12):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $12, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_12_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $12, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $12, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_12_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $12, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $12, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_12_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 12(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+/* Odd shifts cannot occur for wmemcmp (4-byte elements), so shr_13/15
+   are compiled out there.  Structure is identical to shr_12 above,
+   with the palignr immediate and tail bias changed to 13.  */
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_13):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_13_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $13, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $13, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx              /* 0xffff = all 16 bytes equal */
+ jnz L(exit)
+ add $13, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+/* Long-length loop; see shr_12_gobble for the sub/sbb flag chain.  */
+ ALIGN (4)
+L(shr_13_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $13, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $13, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_13_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $13, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $13, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_13_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_13_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_13_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 13(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+/* 14-byte source misalignment; identical structure to shr_12 with the
+   palignr immediate and tail bias changed to 14.  */
+ ALIGN (4)
+L(shr_14):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx              /* 0xffff = all 16 bytes equal */
+ jnz L(exit)
+ add $14, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+/* Long-length loop; see shr_12_gobble for the sub/sbb flag chain.  */
+ ALIGN (4)
+L(shr_14_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $14, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $14, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_14_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $14, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $14, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_14_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 14(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+/* 15-byte source misalignment (byte builds only; the enclosing
+   "#ifndef USE_AS_WMEMCMP" opened before shr_13 closes below).
+   Identical structure to shr_12 with immediate/bias 15.  */
+ ALIGN (4)
+L(shr_15):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_15_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $15, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $15, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx              /* 0xffff = all 16 bytes equal */
+ jnz L(exit)
+ add $15, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+/* Long-length loop; see shr_12_gobble for the sub/sbb flag chain.  */
+ ALIGN (4)
+L(shr_15_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $15, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $15, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_15_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $15, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $15, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_15_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_15_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_15_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 15(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+# endif
+/* Mismatch decoder.  A difference was found in one of the last two
+   16-byte chunks; %edx = (pcmpeqb mask - 0xffff) of the second chunk,
+   %xmm1 holds the first chunk's pcmpeqb result, and both pointers sit
+   just past the pair.  The subtraction negates the mismatch bits but
+   preserves the position of the LOWEST set bit, so scanning %dl/%dh
+   from bit 0 upward still locates the first differing byte.
+   NOTE (review): %rax appears to carry the source-alignment bias saved
+   by the shr_N entry ("mov %edx, %eax") -- set up above this excerpt;
+   confirm against the full file.  */
+ ALIGN (4)
+L(exit):
+ pmovmskb %xmm1, %r8d
+ sub $0xffff, %r8d
+ jz L(first16bytes)             /* first chunk clean: diff is in edx */
+ lea -16(%rsi), %rsi            /* diff is in the earlier chunk */
+ lea -16(%rdi), %rdi
+ mov %r8d, %edx
+L(first16bytes):
+ add %rax, %rsi                 /* restore true (unaligned) source */
+L(less16bytes):
+# ifndef USE_AS_WMEMCMP
+/* Low 8 mask bits = bytes at -16..-9; high 8 bits = -8..-1.  */
+ test %dl, %dl
+ jz L(next_24_bytes)
+
+ test $0x01, %dl
+ jnz L(Byte16)
+
+ test $0x02, %dl
+ jnz L(Byte17)
+
+ test $0x04, %dl
+ jnz L(Byte18)
+
+ test $0x08, %dl
+ jnz L(Byte19)
+
+ test $0x10, %dl
+ jnz L(Byte20)
+
+ test $0x20, %dl
+ jnz L(Byte21)
+
+ test $0x40, %dl
+ jnz L(Byte22)
+
+/* Bit 7: byte at offset -9.  */
+ movzbl -9(%rdi), %eax
+ movzbl -9(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+/* Each ByteNN target returns the unsigned-byte difference at a fixed
+   negative offset from the already-advanced pointers.  */
+ ALIGN (4)
+L(Byte16):
+ movzbl -16(%rdi), %eax
+ movzbl -16(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte17):
+ movzbl -15(%rdi), %eax
+ movzbl -15(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte18):
+ movzbl -14(%rdi), %eax
+ movzbl -14(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte19):
+ movzbl -13(%rdi), %eax
+ movzbl -13(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte20):
+ movzbl -12(%rdi), %eax
+ movzbl -12(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte21):
+ movzbl -11(%rdi), %eax
+ movzbl -11(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte22):
+ movzbl -10(%rdi), %eax
+ movzbl -10(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+/* Diff is in the upper half: advance 8 so the ByteNN offsets work
+   unchanged, then dispatch on %dh.  */
+ ALIGN (4)
+L(next_24_bytes):
+ lea 8(%rdi), %rdi
+ lea 8(%rsi), %rsi
+ test $0x01, %dh
+ jnz L(Byte16)
+
+ test $0x02, %dh
+ jnz L(Byte17)
+
+ test $0x04, %dh
+ jnz L(Byte18)
+
+ test $0x08, %dh
+ jnz L(Byte19)
+
+ test $0x10, %dh
+ jnz L(Byte20)
+
+ test $0x20, %dh
+ jnz L(Byte21)
+
+ test $0x40, %dh
+ jnz L(Byte22)
+
+ mov -9(%rdi), %eax
+ and $0xff, %eax
+ mov -9(%rsi), %edx
+ and $0xff, %edx
+ sub %edx, %eax
+ ret
+# else
+/* special for wmemcmp: narrow the mask to one of the four 4-byte
+   elements and compare it as a 32-bit word; find_diff orders it.  */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%rdi), %eax
+ cmp -16(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(second_double_word):
+ mov -12(%rdi), %eax
+ cmp -12(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%rdi), %eax
+ cmp -8(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(fourth_double_word):
+ mov -4(%rdi), %eax
+ cmp -4(%rsi), %eax
+ jne L(find_diff)
+ ret
+# endif
+
+/* Tail dispatch: %ecx holds the number of remaining bytes (at most
+   47 -- the largest target below is L(47bytes)).  Coarse ranges are
+   tested with jae, then exact lengths linearly.  wmemcmp lengths are
+   always multiples of 4, so its builds keep only every fourth target.  */
+ ALIGN (4)
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+ cmp $0, %ecx
+ je L(0bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $1, %ecx
+ je L(1bytes)
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
+
+ ALIGN (4)
+L(more8bytes):
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $9, %ecx
+ je L(9bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $11, %ecx
+ je L(11bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ cmp $13, %ecx
+ je L(13bytes)
+ cmp $14, %ecx
+ je L(14bytes)
+ jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
+
+ ALIGN (4)
+L(more16bytes):
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $17, %ecx
+ je L(17bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $19, %ecx
+ je L(19bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ cmp $21, %ecx
+ je L(21bytes)
+ cmp $22, %ecx
+ je L(22bytes)
+ jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
+
+ ALIGN (4)
+L(more24bytes):
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $25, %ecx
+ je L(25bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $27, %ecx
+ je L(27bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ cmp $29, %ecx
+ je L(29bytes)
+ cmp $30, %ecx
+ je L(30bytes)
+ jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
+
+ ALIGN (4)
+L(more32bytes):
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $33, %ecx
+ je L(33bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $35, %ecx
+ je L(35bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ cmp $37, %ecx
+ je L(37bytes)
+ cmp $38, %ecx
+ je L(38bytes)
+ jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+/* Lengths 40..47, then the 4-byte-stride compare chain: both pointers
+   already point one past the end, so L(Nbytes) compares the dword at
+   offset -N and falls through to L(N-4 bytes), ending at L(0bytes)
+   which returns 0 (equal).  Byte builds keep %ecx as the second
+   operand copy for L(find_diff); wmemcmp compares memory directly
+   since its find_diff only needs the flags.  */
+ ALIGN (4)
+L(more40bytes):
+ cmp $40, %ecx
+ je L(40bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $41, %ecx
+ je L(41bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $43, %ecx
+ je L(43bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ cmp $45, %ecx
+ je L(45bytes)
+ cmp $46, %ecx
+ je L(46bytes)
+ jmp L(47bytes)
+
+ ALIGN (4)
+L(44bytes):
+ movl -44(%rdi), %eax
+ movl -44(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(40bytes):
+ movl -40(%rdi), %eax
+ movl -40(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(36bytes):
+ movl -36(%rdi), %eax
+ movl -36(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(32bytes):
+ movl -32(%rdi), %eax
+ movl -32(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(28bytes):
+ movl -28(%rdi), %eax
+ movl -28(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%rdi), %eax
+ movl -24(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%rdi), %eax
+ movl -20(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%rdi), %eax
+ movl -16(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%rdi), %eax
+ movl -12(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%rdi), %eax
+ movl -8(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%rdi), %eax
+ movl -4(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(0bytes):
+ xor %eax, %eax
+ ret
+# else
+ ALIGN (4)
+L(44bytes):
+ movl -44(%rdi), %eax
+ cmp -44(%rsi), %eax
+ jne L(find_diff)
+L(40bytes):
+ movl -40(%rdi), %eax
+ cmp -40(%rsi), %eax
+ jne L(find_diff)
+L(36bytes):
+ movl -36(%rdi), %eax
+ cmp -36(%rsi), %eax
+ jne L(find_diff)
+L(32bytes):
+ movl -32(%rdi), %eax
+ cmp -32(%rsi), %eax
+ jne L(find_diff)
+L(28bytes):
+ movl -28(%rdi), %eax
+ cmp -28(%rsi), %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%rdi), %eax
+ cmp -24(%rsi), %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%rdi), %eax
+ cmp -20(%rsi), %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%rdi), %eax
+ cmp -16(%rsi), %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%rdi), %eax
+ cmp -12(%rsi), %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%rdi), %eax
+ cmp -8(%rsi), %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%rdi), %eax
+ cmp -4(%rsi), %eax
+ jne L(find_diff)
+L(0bytes):
+ xor %eax, %eax
+ ret
+# endif
+
+/* Byte-build-only chains for lengths not divisible by 4, one chain per
+   residue class: 45,41,...,5,1 / 46,...,2 / 47,...,3.  Dword compares
+   walk down in steps of 4; the final 1-3 bytes are compared low byte
+   first (little-endian: the byte at the lower address decides).  */
+# ifndef USE_AS_WMEMCMP
+ ALIGN (4)
+L(45bytes):
+ movl -45(%rdi), %eax
+ movl -45(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(41bytes):
+ movl -41(%rdi), %eax
+ movl -41(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(37bytes):
+ movl -37(%rdi), %eax
+ movl -37(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(33bytes):
+ movl -33(%rdi), %eax
+ movl -33(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(29bytes):
+ movl -29(%rdi), %eax
+ movl -29(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(25bytes):
+ movl -25(%rdi), %eax
+ movl -25(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(21bytes):
+ movl -21(%rdi), %eax
+ movl -21(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(17bytes):
+ movl -17(%rdi), %eax
+ movl -17(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(13bytes):
+ movl -13(%rdi), %eax
+ movl -13(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(9bytes):
+ movl -9(%rdi), %eax
+ movl -9(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(5bytes):
+ movl -5(%rdi), %eax
+ movl -5(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(1bytes):
+ movzbl -1(%rdi), %eax
+ cmpb -1(%rsi), %al
+ jne L(set)                     /* order the single byte via CF */
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(46bytes):
+ movl -46(%rdi), %eax
+ movl -46(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(42bytes):
+ movl -42(%rdi), %eax
+ movl -42(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(38bytes):
+ movl -38(%rdi), %eax
+ movl -38(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(34bytes):
+ movl -34(%rdi), %eax
+ movl -34(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(30bytes):
+ movl -30(%rdi), %eax
+ movl -30(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(26bytes):
+ movl -26(%rdi), %eax
+ movl -26(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(22bytes):
+ movl -22(%rdi), %eax
+ movl -22(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(18bytes):
+ movl -18(%rdi), %eax
+ movl -18(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(14bytes):
+ movl -14(%rdi), %eax
+ movl -14(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(10bytes):
+ movl -10(%rdi), %eax
+ movl -10(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(6bytes):
+ movl -6(%rdi), %eax
+ movl -6(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(2bytes):
+/* Two-byte tail: compare the low (first) byte, then both.  */
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmp %ecx, %eax
+ jne L(set)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(47bytes):
+ movl -47(%rdi), %eax
+ movl -47(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(43bytes):
+ movl -43(%rdi), %eax
+ movl -43(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(39bytes):
+ movl -39(%rdi), %eax
+ movl -39(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(35bytes):
+ movl -35(%rdi), %eax
+ movl -35(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(31bytes):
+ movl -31(%rdi), %eax
+ movl -31(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(27bytes):
+ movl -27(%rdi), %eax
+ movl -27(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(23bytes):
+ movl -23(%rdi), %eax
+ movl -23(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(19bytes):
+ movl -19(%rdi), %eax
+ movl -19(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%rdi), %eax
+ movl -15(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%rdi), %eax
+ movl -11(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%rdi), %eax
+ movl -7(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(3bytes):
+/* Three-byte tail: two bytes via movzwl, then the last byte.  */
+ movzwl -3(%rdi), %eax
+ movzwl -3(%rsi), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmp %ecx, %eax
+ jne L(set)
+ movzbl -1(%rdi), %eax
+ cmpb -1(%rsi), %al
+ jne L(set)
+ xor %eax, %eax
+ ret
+
+/* Order a differing dword.  Byte build: %eax/%ecx hold the dwords from
+   rdi/rsi; memcmp semantics require ordering by the FIRST differing
+   byte, i.e. the lowest byte on little-endian, so compare bytes from
+   the bottom up and let the final unsigned compare set CF.  */
+ ALIGN (4)
+L(find_diff):
+ cmpb %cl, %al
+ jne L(set)
+ cmpw %cx, %ax
+ jne L(set)
+ shr $16, %eax
+ shr $16, %ecx
+ cmpb %cl, %al
+ jne L(set)
+
+/* We only get here when a difference is already known to exist,
+   so this final compare must set the deciding flags.  */
+
+ cmp %ecx, %eax
+L(set):
+/* CF set (first operand below) -> -1; CF clear -> +1.  */
+ sbb %eax, %eax
+ sbb $-1, %eax
+ ret
+# else
+
+/* for wmemcmp: flags come from the caller's 32-bit cmp; jg is a
+   signed comparison (wide chars compared as signed ints here).  */
+ ALIGN (4)
+L(find_diff):
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ ALIGN (4)
+L(find_diff_bigger):
+ ret
+# endif
+
+ ALIGN (4)
+L(equal):
+ xor %eax, %eax
+ ret
+
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index 301ab287f5..8bf8f3a417 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -1,5 +1,5 @@
/* Multiple versions of memcmp
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -29,11 +29,20 @@ ENTRY(memcmp)
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
-1: leaq __memcmp_sse2(%rip), %rax
- testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
- jz 2f
+
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 2f
+ leaq __memcmp_sse2(%rip), %rax
+ ret
+
+2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 3f
leaq __memcmp_sse4_1(%rip), %rax
-2: ret
+ ret
+
+3: leaq __memcmp_ssse3(%rip), %rax
+ ret
+
END(memcmp)
# undef ENTRY
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..793f059aff
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP __wmemcmp_sse2
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..b07973a4f6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_1
+
+#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..7c3b7ed178
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -0,0 +1,47 @@
+/* Multiple versions of wmemcmp
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(wmemcmp)
+ .type wmemcmp, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 2f
+ leaq __wmemcmp_sse2(%rip), %rax
+ ret
+
+2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 3f
+ leaq __wmemcmp_sse4_1(%rip), %rax
+ ret
+
+3: leaq __wmemcmp_ssse3(%rip), %rax
+ ret
+
+END(wmemcmp)
+#endif