author     Liubov Dmitrieva <liubov.dmitrieva@gmail.com>   2011-10-15 11:10:08 -0400
committer  Ulrich Drepper <drepper@gmail.com>              2011-10-15 11:10:08 -0400
commit     be13f7bff66e1850f9057dd813d6e7be022d9516 (patch)
tree       d918a146db9072ad120f0010481c53d9b450c9a5
parent     556a2007974ed39a68c87a8b5181f8057ecd0d6f (diff)
Optimized memcmp and wmemcmp for x86-64 and x86-32
 ChangeLog                                    |   29
 NEWS                                         |    2
 string/test-memcmp.c                         |   47
 sysdeps/i386/i686/multiarch/Makefile         |    3
 sysdeps/i386/i686/multiarch/memcmp-sse4.S    |  396
 sysdeps/i386/i686/multiarch/memcmp-ssse3.S   |  565
 sysdeps/i386/i686/multiarch/wmemcmp-c.c      |    5
 sysdeps/i386/i686/multiarch/wmemcmp-sse4.S   |    4
 sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S  |    4
 sysdeps/i386/i686/multiarch/wmemcmp.S        |   59
 sysdeps/x86_64/multiarch/Makefile            |    3
 sysdeps/x86_64/multiarch/memcmp-sse4.S       |  192
 sysdeps/x86_64/multiarch/memcmp-ssse3.S      | 1997
 sysdeps/x86_64/multiarch/memcmp.S            |   19
 sysdeps/x86_64/multiarch/wmemcmp-c.c         |    5
 sysdeps/x86_64/multiarch/wmemcmp-sse4.S      |    4
 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S     |    4
 sysdeps/x86_64/multiarch/wmemcmp.S           |   47
 wcsmbs/wmemcmp.c                             |   21
 19 files changed, 3070 insertions(+), 336 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 49f091aa83..414611a93c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,32 @@
+2011-09-27 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c.
+ * sysdeps/x86_64/multiarch/memcmp-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/memcmp.S: Update. Add __memcmp_ssse3.
+ * sysdeps/x86_64/multiarch/memcmp-sse4.S: Update.
+ (USE_AS_WMEMCMP): New macro.
+ Fix indentation.
+ * sysdeps/x86_64/multiarch/wmemcmp.S: New file.
+ * sysdeps/x86_64/multiarch/wmemcmp-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/wmemcmp-sse4.S: New file.
+ * sysdeps/x86_64/multiarch/wmemcmp-c.c: New file.
+ * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+ wmemcmp-ssse3 wmemcmp-sse4 wmemcmp-c.
+ * sysdeps/i386/i686/multiarch/wmemcmp.S: New file.
+ * sysdeps/i386/i686/multiarch/wmemcmp-c.c: New file.
+ * sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/wmemcmp-sse4.S: New file.
+ * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Update.
+ (USE_AS_WMEMCMP): New macro.
+ * sysdeps/i386/i686/multiarch/memcmp-ssse3.S: Likewise.
+ * string/test-memcmp.c: Update.
+ Fix simple_wmemcmp.
+ Add new tests.
+ * wcsmbs/wmemcmp.c: Update.
+ (WMEMCMP): New macro.
+ Fix overflow bug.
+
2011-10-12 Andreas Jaeger <aj@suse.de>
[BZ #13268]
diff --git a/NEWS b/NEWS
index 7e9b2c18a3..cdb2973028 100644
--- a/NEWS
+++ b/NEWS
@@ -33,7 +33,7 @@ Version 2.15
* Optimized strchr and strrchr for SSE on x86-32.
Contributed by Liubov Dmitrieva.
-* Optimized memchr, memrchr, rawmemchr for x86-64 and x86-32.
+* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
Contributed by Liubov Dmitrieva.
* New interfaces: scandirat, scandirat64
diff --git a/string/test-memcmp.c b/string/test-memcmp.c
index 4675bd9eaa..f246d3a78b 100644
--- a/string/test-memcmp.c
+++ b/string/test-memcmp.c
@@ -29,9 +29,21 @@
# define MEMCPY wmemcpy
# define SIMPLE_MEMCMP simple_wmemcmp
# define CHAR wchar_t
-# define MAX_CHAR 256000
-# define UCHAR uint32_t
+# define UCHAR wchar_t
# define CHARBYTES 4
+# define CHAR__MIN WCHAR_MIN
+# define CHAR__MAX WCHAR_MAX
+int
+simple_wmemcmp (const wchar_t *s1, const wchar_t *s2, size_t n)
+{
+ int ret = 0;
+ /* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+ */
+ while (n-- && (ret = *s1 < *s2 ? -1 : *s1 == *s2 ? 0 : 1) == 0) {s1++; s2++;}
+ return ret;
+}
#else
# define MEMCMP memcmp
# define MEMCPY memcpy
@@ -40,18 +52,20 @@
# define MAX_CHAR 255
# define UCHAR unsigned char
# define CHARBYTES 1
-#endif
-
-typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+# define CHAR__MIN CHAR_MIN
+# define CHAR__MAX CHAR_MAX
int
-SIMPLE_MEMCMP (const CHAR *s1, const CHAR *s2, size_t n)
+simple_memcmp (const char *s1, const char *s2, size_t n)
{
int ret = 0;
- while (n-- && (ret = *(UCHAR *) s1++ - *(UCHAR *) s2++) == 0);
+ while (n-- && (ret = *(unsigned char *) s1++ - *(unsigned char *) s2++) == 0);
return ret;
}
+#endif
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
IMPL (SIMPLE_MEMCMP, 0)
IMPL (MEMCMP, 1)
@@ -121,7 +135,7 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
s2 = (CHAR *) (buf2 + align2);
for (i = 0; i < len; i++)
- s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR;
+ s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX;
s1[len] = align1;
s2[len] = align2;
@@ -412,8 +426,8 @@ check1 (void)
s2[99] = 1;
s1[100] = 116;
s2[100] = 116;
- s1[101] = -13;
- s2[101] = -13;
+ s1[101] = CHAR__MIN;
+ s2[101] = CHAR__MAX;
s1[102] = -109;
s2[102] = -109;
s1[103] = 1;
@@ -434,8 +448,8 @@ check1 (void)
s2[110] = -109;
s1[111] = 1;
s2[111] = 1;
- s1[112] = 20;
- s2[112] = 20;
+ s1[112] = CHAR__MAX;
+ s2[112] = CHAR__MIN;
s1[113] = -13;
s2[113] = -13;
s1[114] = -109;
@@ -444,9 +458,12 @@ check1 (void)
s2[115] = 1;
n = 116;
- exp_result = SIMPLE_MEMCMP (s1, s2, n);
- FOR_EACH_IMPL (impl, 0)
- check_result (impl, s1, s2, n, exp_result);
+ for (size_t i = 0; i < n; i++)
+ {
+ exp_result = SIMPLE_MEMCMP (s1 + i, s2 + i, n - i);
+ FOR_EACH_IMPL (impl, 0)
+ check_result (impl, s1 + i, s2 + i, n - i, exp_result);
+ }
}
int
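
The reworked test above hinges on the semantic point that the patch's
"Warning!" comments repeat: memcmp orders elements as unsigned bytes,
while wmemcmp orders them as signed wchar_t values, which is why check1
now plants CHAR__MIN/CHAR__MAX (WCHAR_MIN/WCHAR_MAX in the wide build)
into the buffers.  A minimal standalone C demonstration of that
difference (not part of the patch; it assumes a glibc-style target
where wchar_t is a signed 32-bit type):

    #include <stdio.h>
    #include <string.h>
    #include <wchar.h>

    int
    main (void)
    {
      /* As unsigned bytes, 0x80 is 128, so memcmp ranks b1 above b2.  */
      const char b1[1] = { (char) 0x80 };
      const char b2[1] = { 0x01 };
      printf ("memcmp sign: %d\n", memcmp (b1, b2, 1) > 0 ? 1 : -1);

      /* As signed wide characters, WCHAR_MIN is negative, so wmemcmp
         ranks w1 below w2 despite its "larger" bit pattern.  */
      const wchar_t w1[1] = { WCHAR_MIN };
      const wchar_t w2[1] = { 1 };
      printf ("wmemcmp sign: %d\n", wmemcmp (w1, w2, 1) < 0 ? -1 : 1);
      return 0;
    }
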
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 8a4c2197b0..98d1ad6d54 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -17,7 +17,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \
memrchr-sse2 memrchr-sse2-bsf memrchr-c \
- rawmemchr-sse2 rawmemchr-sse2-bsf
+ rawmemchr-sse2 rawmemchr-sse2-bsf \
+ wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
index b1ed778f1f..1f5dbc15cb 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.2
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,84 +20,97 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_2
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_2
+# endif
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1 + 4
+# define LEN BLK2 + 4
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
-#ifdef SHARED
-# define JMPTBL(I, B) I - B
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- /* We first load PC into EBX. */ \
- call __i686.get_pc_thunk.bx; \
- /* Get the address of the jump table. */ \
- addl $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table and adjuested EDX/ESI. Go. */ \
- jmp *%ebx
-
- .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
- .globl __i686.get_pc_thunk.bx
- .hidden __i686.get_pc_thunk.bx
- ALIGN (4)
- .type __i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
- movl (%esp), %ebx
- ret
-#else
-# define JMPTBL(I, B) I
+ jump table with relative offsets. INDEX is a register containing the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+/* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+/* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+/* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+/* We loaded the jump table and adjusted EDX/ESI. Go. */ \
+ jmp *%ebx
+# else
+# define JMPTBL(I, B) I
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
-#endif
+ jump table with relative offsets. INDEX is a register containing the
+ index into the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
.section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP)
movl BLK1(%esp), %eax
movl BLK2(%esp), %edx
movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(return0)
+# else
cmp $1, %ecx
jbe L(less1bytes)
+# endif
+
pxor %xmm0, %xmm0
cmp $64, %ecx
ja L(64bytesormore)
cmp $8, %ecx
- PUSH (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ PUSH (%ebx)
+ jb L(less8bytes)
+# else
jb L(less8bytes)
+ PUSH (%ebx)
+# endif
+
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less8bytes):
mov (%eax), %bl
cmpb (%edx), %bl
@@ -141,22 +154,49 @@ L(less8bytes):
mov 6(%eax), %bl
cmpb 6(%edx), %bl
je L(0bytes)
+
L(nonzero):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(above)
neg %eax
L(above):
ret
CFI_PUSH (%ebx)
+# endif
- ALIGN (4)
+ .p2align 4
L(0bytes):
- POP (%ebx)
+ POP (%ebx)
xor %eax, %eax
ret
- ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ je L(return0)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+ .p2align 4
+L(return0):
+ xor %eax, %eax
+ ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less1bytes):
jb L(0bytesend)
movzbl (%eax), %eax
@@ -164,14 +204,14 @@ L(less1bytes):
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(0bytesend):
xor %eax, %eax
ret
-
- ALIGN (4)
+# endif
+ .p2align 4
L(64bytesormore):
- PUSH (%ebx)
+ PUSH (%ebx)
mov %ecx, %ebx
mov $64, %ecx
sub $64, %ebx
@@ -208,7 +248,14 @@ L(64bytesormore_loop):
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
- ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* This label is needed only to fill table_64bytes.  */
+L(unreal_case):
+/* no code here */
+
+# endif
+ .p2align 4
L(find_16diff):
sub $16, %ecx
L(find_32diff):
@@ -218,9 +265,9 @@ L(find_48diff):
L(find_64diff):
add %ecx, %edx
add %ecx, %eax
- jmp L(16bytes)
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(16bytes):
mov -16(%eax), %ecx
mov -16(%edx), %ebx
@@ -243,8 +290,30 @@ L(4bytes):
mov $0, %eax
jne L(find_diff)
RETURN
+# else
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ cmp -4(%edx), %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(49bytes):
movdqu -49(%eax), %xmm1
movdqu -49(%edx), %xmm2
@@ -285,7 +354,7 @@ L(5bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(50bytes):
mov $-50, %ebx
movdqu -50(%eax), %xmm1
@@ -330,7 +399,7 @@ L(2bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(51bytes):
mov $-51, %ebx
movdqu -51(%eax), %xmm1
@@ -378,8 +447,8 @@ L(1bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(52bytes):
movdqu -52(%eax), %xmm1
movdqu -52(%edx), %xmm2
@@ -402,13 +471,18 @@ L(20bytes):
ptest %xmm2, %xmm0
jnc L(less16bytes)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(53bytes):
movdqu -53(%eax), %xmm1
movdqu -53(%edx), %xmm2
@@ -440,7 +514,7 @@ L(21bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(54bytes):
movdqu -54(%eax), %xmm1
movdqu -54(%edx), %xmm2
@@ -476,7 +550,7 @@ L(22bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(55bytes):
movdqu -55(%eax), %xmm1
movdqu -55(%edx), %xmm2
@@ -513,8 +587,8 @@ L(23bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(56bytes):
movdqu -56(%eax), %xmm1
movdqu -56(%edx), %xmm2
@@ -538,18 +612,27 @@ L(24bytes):
jnc L(less16bytes)
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(57bytes):
movdqu -57(%eax), %xmm1
movdqu -57(%edx), %xmm2
@@ -585,7 +668,7 @@ L(25bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(58bytes):
movdqu -58(%eax), %xmm1
movdqu -58(%edx), %xmm2
@@ -627,7 +710,7 @@ L(26bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(59bytes):
movdqu -59(%eax), %xmm1
movdqu -59(%edx), %xmm2
@@ -668,8 +751,8 @@ L(27bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(60bytes):
movdqu -60(%eax), %xmm1
movdqu -60(%edx), %xmm2
@@ -691,22 +774,38 @@ L(28bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+
mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
jne L(find_diff)
+
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
+
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(61bytes):
movdqu -61(%eax), %xmm1
movdqu -61(%edx), %xmm2
@@ -749,7 +848,7 @@ L(29bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(62bytes):
movdqu -62(%eax), %xmm1
movdqu -62(%edx), %xmm2
@@ -792,7 +891,7 @@ L(30bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(63bytes):
movdqu -63(%eax), %xmm1
movdqu -63(%edx), %xmm2
@@ -838,8 +937,9 @@ L(31bytes):
mov $0, %eax
jne L(end)
RETURN
+# endif
- ALIGN (4)
+ .p2align 4
L(64bytes):
movdqu -64(%eax), %xmm1
movdqu -64(%edx), %xmm2
@@ -863,28 +963,45 @@ L(32bytes):
jnc L(less16bytes)
mov -16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -16(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -16(%edx), %ecx
+# endif
jne L(find_diff)
mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
jne L(find_diff)
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less16bytes):
add %ebx, %eax
add %ebx, %edx
@@ -910,9 +1027,35 @@ L(less16bytes):
mov $0, %eax
jne L(find_diff)
RETURN
+# else
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ cmp 4(%edx), %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ cmp 8(%edx), %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ cmp 12(%edx), %ecx
+
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
- ALIGN (4)
+ .p2align 4
L(find_diff):
+# ifndef USE_AS_WMEMCMP
cmpb %bl, %cl
jne L(end)
cmp %bx, %cx
@@ -923,17 +1066,29 @@ L(find_diff):
jne L(end)
cmp %bx, %cx
L(end):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(bigger)
neg %eax
L(bigger):
ret
+# else
+ POP (%ebx)
+ mov $1, %eax
+ jg L(bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(bigger):
+ ret
+# endif
END (MEMCMP)
.section .rodata.sse4.2,"a",@progbits
- ALIGN (2)
+ .p2align 2
.type L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes))
@@ -1000,5 +1155,72 @@ L(table_64bytes):
.int JMPTBL (L(62bytes), L(table_64bytes))
.int JMPTBL (L(63bytes), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes))
- .size L(table_64bytes), .-L(table_64bytes)
+# else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+# endif
#endif
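
Taken together, the USE_AS_WMEMCMP additions to this file make three
changes: the element count is scaled to a byte count up front
("shl $2, %ecx") with an explicit zero check, the jump table keeps real
targets only at multiples of 4 (every other slot points at the filler
label "unreal_case"), and a mismatch is resolved with a signed 4-byte
comparison ("cmp" followed by "jg") instead of the unsigned byte path.
A rough C model of that control flow, offered purely as an illustration
of the logic rather than of the SSE4.2 code itself:

    #include <stddef.h>
    #include <wchar.h>

    /* Hypothetical helper mirroring the wmemcmp entry logic of
       memcmp-sse4.S; the name is illustrative only.  */
    static int
    wmemcmp_model (const wchar_t *s1, const wchar_t *s2, size_t n)
    {
      /* "shl $2, %ecx" turns the element count into a byte count and
         "jz L(return0)" handles n == 0; in C the scaling is implicit
         in the indexing below.  */
      if (n == 0)
        return 0;
      for (size_t i = 0; i < n; i++)
        if (s1[i] != s2[i])
          /* Signed element comparison, as in the "cmp ...; jg ..." pairs.  */
          return s1[i] > s2[i] ? 1 : -1;
      return 0;
    }
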
diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
index 2e0d15fe55..eab85c1de1 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
-/* memcmp with SSSE3
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,47 +20,64 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1+4
+# define LEN BLK2+4
+# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
- .section .text.ssse3,"ax",@progbits
+ atom_text_section
ENTRY (MEMCMP)
movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(zero)
+# endif
+
movl BLK1(%esp), %eax
cmp $48, %ecx
movl BLK2(%esp), %edx
jae L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
cmp $1, %ecx
jbe L(less1bytes)
- PUSH (%ebx)
+# endif
+
+ PUSH (%ebx)
add %ecx, %edx
add %ecx, %eax
jmp L(less48bytes)
- ALIGN (4)
- CFI_POP (%ebx)
+ CFI_POP (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less1bytes):
jb L(zero)
movb (%eax), %cl
@@ -71,29 +88,30 @@ L(less1bytes):
neg %eax
L(1bytesend):
ret
+# endif
- ALIGN (4)
+ .p2align 4
L(zero):
- mov $0, %eax
+ xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
cfi_remember_state
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
movl %eax, %edi
movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
lea 16(%edi), %edi
- sub $0xffff, %edx
+ sub $0xffff, %edx
lea 16(%esi), %esi
- jnz L(less16bytes)
+ jnz L(less16bytes)
mov %edi, %edx
and $0xf, %edx
xor %edx, %edi
@@ -104,6 +122,7 @@ L(48bytesormore):
jz L(shr_0)
xor %edx, %esi
+# ifndef USE_AS_WMEMCMP
cmp $8, %edx
jae L(next_unaligned_table)
cmp $0, %edx
@@ -122,7 +141,7 @@ L(48bytesormore):
je L(shr_6)
jmp L(shr_7)
- ALIGN (4)
+ .p2align 2
L(next_unaligned_table):
cmp $8, %edx
je L(shr_8)
@@ -139,8 +158,17 @@ L(next_unaligned_table):
cmp $14, %edx
je L(shr_14)
jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
- ALIGN (4)
+ .p2align 4
L(shr_0):
cmp $80, %ecx
jae L(shr_0_gobble)
@@ -159,13 +187,13 @@ L(shr_0):
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_0_gobble):
lea -48(%ecx), %ecx
movdqa (%esi), %xmm0
@@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -235,13 +264,13 @@ L(shr_1):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -288,14 +317,14 @@ L(shr_1_gobble_next):
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -319,13 +348,13 @@ L(shr_2):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -372,13 +401,13 @@ L(shr_2_gobble_next):
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -402,13 +431,13 @@ L(shr_3):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -455,13 +484,14 @@ L(shr_3_gobble_next):
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -485,13 +515,13 @@ L(shr_4):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -538,13 +568,14 @@ L(shr_4_gobble_next):
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -568,13 +599,13 @@ L(shr_5):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -621,13 +652,13 @@ L(shr_5_gobble_next):
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -651,13 +682,13 @@ L(shr_6):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -704,13 +735,13 @@ L(shr_6_gobble_next):
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -734,13 +765,13 @@ L(shr_7):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -787,13 +818,14 @@ L(shr_7_gobble_next):
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -817,13 +849,13 @@ L(shr_8):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -870,13 +902,14 @@ L(shr_8_gobble_next):
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -900,13 +933,13 @@ L(shr_9):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -953,13 +986,13 @@ L(shr_9_gobble_next):
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -983,13 +1016,13 @@ L(shr_10):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1036,13 +1069,13 @@ L(shr_10_gobble_next):
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1066,13 +1099,13 @@ L(shr_11):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1119,13 +1152,14 @@ L(shr_11_gobble_next):
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1149,13 +1183,13 @@ L(shr_12):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1202,13 +1236,14 @@ L(shr_12_gobble_next):
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1232,13 +1267,13 @@ L(shr_13):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1285,13 +1320,13 @@ L(shr_13_gobble_next):
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1315,13 +1350,13 @@ L(shr_14):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1368,13 +1403,13 @@ L(shr_14_gobble_next):
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1398,13 +1433,13 @@ L(shr_15):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1451,13 +1486,14 @@ L(shr_15_gobble_next):
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(exit):
pmovmskb %xmm1, %ebx
sub $0xffff, %ebx
@@ -1465,9 +1501,12 @@ L(exit):
lea -16(%esi), %esi
lea -16(%edi), %edi
mov %ebx, %edx
+
L(first16bytes):
add %eax, %esi
L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
test %dl, %dl
jz L(next_24_bytes)
@@ -1492,61 +1531,61 @@ L(less16bytes):
test $0x40, %dl
jnz L(Byte22)
L(Byte23):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte16):
- movzbl -16(%edi), %eax
- movzbl -16(%esi), %edx
+ movzbl -16(%edi), %eax
+ movzbl -16(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte17):
- movzbl -15(%edi), %eax
- movzbl -15(%esi), %edx
+ movzbl -15(%edi), %eax
+ movzbl -15(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte18):
- movzbl -14(%edi), %eax
- movzbl -14(%esi), %edx
+ movzbl -14(%edi), %eax
+ movzbl -14(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte19):
- movzbl -13(%edi), %eax
- movzbl -13(%esi), %edx
+ movzbl -13(%edi), %eax
+ movzbl -13(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte20):
- movzbl -12(%edi), %eax
- movzbl -12(%esi), %edx
+ movzbl -12(%edi), %eax
+ movzbl -12(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte21):
- movzbl -11(%edi), %eax
- movzbl -11(%esi), %edx
+ movzbl -11(%edi), %eax
+ movzbl -11(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte22):
- movzbl -10(%edi), %eax
- movzbl -10(%esi), %edx
+ movzbl -10(%edi), %eax
+ movzbl -10(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(next_24_bytes):
lea 8(%edi), %edi
lea 8(%esi), %esi
@@ -1571,20 +1610,69 @@ L(next_24_bytes):
test $0x40, %dh
jnz L(Byte22)
- ALIGN (4)
+ .p2align 4
L(Byte31):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN_END
+# else
+
+/* special for wmemcmp */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%edi), %eax
+ cmp -16(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov -12(%edi), %eax
+ cmp -12(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%edi), %eax
+ cmp -8(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov -4(%edi), %eax
+ cmp -4(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(nequal_bigger):
+ RETURN_END
+# endif
CFI_PUSH (%ebx)
- ALIGN (4)
+
+ .p2align 4
L(more8bytes):
cmp $16, %ecx
jae L(more16bytes)
cmp $8, %ecx
je L(8bytes)
+# ifndef USE_AS_WMEMCMP
cmp $9, %ecx
je L(9bytes)
cmp $10, %ecx
@@ -1598,13 +1686,17 @@ L(more8bytes):
cmp $14, %ecx
je L(14bytes)
jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more16bytes):
cmp $24, %ecx
jae L(more24bytes)
cmp $16, %ecx
je L(16bytes)
+# ifndef USE_AS_WMEMCMP
cmp $17, %ecx
je L(17bytes)
cmp $18, %ecx
@@ -1618,13 +1710,17 @@ L(more16bytes):
cmp $22, %ecx
je L(22bytes)
jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more24bytes):
cmp $32, %ecx
jae L(more32bytes)
cmp $24, %ecx
je L(24bytes)
+# ifndef USE_AS_WMEMCMP
cmp $25, %ecx
je L(25bytes)
cmp $26, %ecx
@@ -1638,13 +1734,17 @@ L(more24bytes):
cmp $30, %ecx
je L(30bytes)
jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more32bytes):
cmp $40, %ecx
jae L(more40bytes)
cmp $32, %ecx
je L(32bytes)
+# ifndef USE_AS_WMEMCMP
cmp $33, %ecx
je L(33bytes)
cmp $34, %ecx
@@ -1658,11 +1758,35 @@ L(more32bytes):
cmp $38, %ecx
je L(38bytes)
jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+ .p2align 4
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more40bytes):
cmp $40, %ecx
je L(40bytes)
+# ifndef USE_AS_WMEMCMP
cmp $41, %ecx
je L(41bytes)
cmp $42, %ecx
@@ -1677,23 +1801,7 @@ L(more40bytes):
je L(46bytes)
jmp L(47bytes)
- ALIGN (4)
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-
- ALIGN (4)
+ .p2align 4
L(44bytes):
mov -44(%eax), %ecx
mov -44(%edx), %ebx
@@ -1750,11 +1858,64 @@ L(4bytes):
cmp %ebx, %ecx
mov $0, %eax
jne L(find_diff)
- POP (%ebx)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+# else
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ cmp -44(%edx), %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ cmp -40(%edx), %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ cmp -36(%edx), %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ cmp -32(%edx), %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ cmp -28(%edx), %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ cmp -24(%edx), %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ cmp -20(%edx), %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ xor %eax, %eax
+ cmp -4(%edx), %ecx
+ jne L(find_diff)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
+# endif
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+
+ .p2align 4
L(45bytes):
mov -45(%eax), %ecx
mov -45(%edx), %ebx
@@ -1814,11 +1975,11 @@ L(5bytes):
cmp -1(%edx), %cl
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(46bytes):
mov -46(%eax), %ecx
mov -46(%edx), %ebx
@@ -1882,11 +2043,11 @@ L(2bytes):
cmp %bh, %ch
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(47bytes):
movl -47(%eax), %ecx
movl -47(%edx), %ebx
@@ -1953,11 +2114,11 @@ L(3bytes):
cmpb -1(%edx), %al
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(find_diff):
cmpb %bl, %cl
jne L(end)
@@ -1968,14 +2129,30 @@ L(find_diff):
cmp %bl, %cl
jne L(end)
cmp %bx, %cx
+
+ .p2align 4
L(end):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(bigger)
neg %eax
L(bigger):
ret
+# else
-END (MEMCMP)
+/* for wmemcmp */
+ .p2align 4
+L(find_diff):
+ POP (%ebx)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+# endif
+END (MEMCMP)
#endif
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..94ff6151f2
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP __wmemcmp_ia32
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..1a857c7e21
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..5080c14ea7
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,59 @@
+/* Multiple versions of wmemcmp
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+ __i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(wmemcmp)
+ .type wmemcmp, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(wmemcmp)
+#endif
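
The new wmemcmp.S above is an IFUNC selector: when the symbol is
resolved it consults __cpu_features and returns __wmemcmp_ia32,
upgrading to __wmemcmp_ssse3 and then __wmemcmp_sse4_2 as the SSSE3 and
SSE4.2 feature bits allow.  The self-contained C sketch below shows the
same dispatch idea on an ELF/GNU toolchain; it uses GCC's ifunc
attribute and __builtin_cpu_supports in place of glibc's init-arch
machinery, and the three "variants" simply forward to the library
wmemcmp, so only the selection logic is illustrated:

    #include <stdio.h>
    #include <wchar.h>

    typedef int (*wmemcmp_fn) (const wchar_t *, const wchar_t *, size_t);

    /* Stand-ins for __wmemcmp_ia32 / __wmemcmp_ssse3 / __wmemcmp_sse4_2.  */
    static int variant_generic (const wchar_t *a, const wchar_t *b, size_t n)
    { return wmemcmp (a, b, n); }
    static int variant_ssse3 (const wchar_t *a, const wchar_t *b, size_t n)
    { return wmemcmp (a, b, n); }
    static int variant_sse4_2 (const wchar_t *a, const wchar_t *b, size_t n)
    { return wmemcmp (a, b, n); }

    /* Resolver run once by the dynamic loader; like the asm, it starts
       from the baseline and upgrades while the CPU feature bits allow.  */
    static wmemcmp_fn
    resolve_wmemcmp (void)
    {
      __builtin_cpu_init ();
      wmemcmp_fn fn = variant_generic;
      if (__builtin_cpu_supports ("ssse3"))
        {
          fn = variant_ssse3;
          if (__builtin_cpu_supports ("sse4.2"))
            fn = variant_sse4_2;
        }
      return fn;
    }

    int my_wmemcmp (const wchar_t *, const wchar_t *, size_t)
      __attribute__ ((ifunc ("resolve_wmemcmp")));

    int
    main (void)
    {
      printf ("%d\n", my_wmemcmp (L"abc", L"abd", 3) < 0);
      return 0;
    }
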
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a5254dc93c..e0bb9847a8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
- strrchr-sse2-no-bsf strchr-sse2-no-bsf
+ strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+ memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index fc439bb013..28dd505d99 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.1
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.1, wmemcmp with SSE4.1
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,43 +20,54 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_1
+# endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
-#define JMPTBL(I, B) (I - B)
+# define JMPTBL(I, B) (I - B)
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
lea TABLE(%rip), %r11; \
movslq (%r11, INDEX, SCALE), %rcx; \
add %r11, %rcx; \
jmp *%rcx; \
ud2
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
+
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+# endif
pxor %xmm0, %xmm0
cmp $79, %rdx
ja L(79bytesormore)
+# ifndef USE_AS_WMEMCMP
cmp $1, %rdx
je L(firstbyte)
+# endif
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+# ifndef USE_AS_WMEMCMP
ALIGN (4)
L(firstbyte):
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
sub %ecx, %eax
ret
+# endif
ALIGN (4)
L(79bytesormore):
@@ -308,11 +319,11 @@ L(less32bytesin256):
ALIGN (4)
L(512bytesormore):
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8
-#else
+# else
mov __x86_64_data_cache_size_half(%rip), %r8
-#endif
+# endif
mov %r8, %r9
shr $1, %r8
add %r9, %r8
@@ -624,11 +635,11 @@ L(less32bytesin256in2alinged):
ALIGN (4)
L(512bytesormorein2aligned):
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8
-#else
+# else
mov __x86_64_data_cache_size_half(%rip), %r8
-#endif
+# endif
mov %r8, %r9
shr $1, %r8
add %r9, %r8
@@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned):
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(L2_L3_cache_aglined):
sub $64, %rdx
+
ALIGN (4)
L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
@@ -803,13 +815,19 @@ L(12bytes):
jne L(diffin8bytes)
L(4bytes):
mov -4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
L(0bytes):
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal case for wmemcmp */
ALIGN (4)
L(65bytes):
movdqu -65(%rdi), %xmm1
@@ -1017,6 +1035,7 @@ L(1bytes):
movzbl -1(%rsi), %ecx
sub %ecx, %eax
ret
+# endif
ALIGN (4)
L(68bytes):
@@ -1047,13 +1066,20 @@ L(20bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
- mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(69bytes):
movdqu -69(%rsi), %xmm1
@@ -1161,6 +1187,7 @@ L(23bytes):
jne L(diffin8bytes)
xor %eax, %eax
ret
+# endif
ALIGN (4)
L(72bytes):
@@ -1191,13 +1218,16 @@ L(24bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
- mov -8(%rdi), %rax
+
mov -8(%rsi), %rcx
+ mov -8(%rdi), %rax
cmp %rax, %rcx
jne L(diffin8bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(73bytes):
movdqu -73(%rsi), %xmm1
@@ -1312,7 +1342,7 @@ L(27bytes):
jne L(diffin4bytes)
xor %eax, %eax
ret
-
+# endif
ALIGN (4)
L(76bytes):
movdqu -76(%rsi), %xmm1
@@ -1346,13 +1376,19 @@ L(28bytes):
mov -12(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
- mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(77bytes):
movdqu -77(%rsi), %xmm1
@@ -1474,7 +1510,7 @@ L(31bytes):
jne L(diffin8bytes)
xor %eax, %eax
ret
-
+# endif
ALIGN (4)
L(64bytes):
movdqu -64(%rdi), %xmm2
@@ -1527,7 +1563,17 @@ L(diffin8bytes):
jne L(diffin4bytes)
shr $32, %rcx
shr $32, %rax
+
+# ifdef USE_AS_WMEMCMP
+/* for wmemcmp */
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+# endif
+
L(diffin4bytes):
+# ifndef USE_AS_WMEMCMP
cmp %cx, %ax
jne L(diffin2bytes)
shr $16, %ecx
@@ -1546,11 +1592,28 @@ L(end):
and $0xff, %ecx
sub %ecx, %eax
ret
+# else
+
+/* for wmemcmp */
+ mov $1, %eax
+ jl L(nequal_bigger)
+ neg %eax
+ ret
+
+ ALIGN (4)
+L(nequal_bigger):
+ ret
+
+L(unreal_case):
+ xor %eax, %eax
+ ret
+# endif
END (MEMCMP)
.section .rodata.sse4.1,"a",@progbits
ALIGN (3)
+# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes))
@@ -1632,4 +1695,87 @@ L(table_64bytes):
.int JMPTBL (L(77bytes), L(table_64bytes))
.int JMPTBL (L(78bytes), L(table_64bytes))
.int JMPTBL (L(79bytes), L(table_64bytes))
+# else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+# endif
#endif
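
In this 64-bit SSE4.1 version the wmemcmp-specific work happens at
L(diffin8bytes): once two 8-byte chunks are known to differ, the low
dword holds the earlier wchar_t on this little-endian target, so it is
examined first, and the verdict comes from a signed 32-bit comparison
(the "jl" path under USE_AS_WMEMCMP) rather than memcmp's unsigned byte
narrowing.  An illustrative C rendering of just that step (a model of
the logic, not the assembly; the helper name is made up):

    #include <stdint.h>

    /* chunk1/chunk2 are the differing 8-byte loads from s1 and s2.
       Return the wmemcmp-style sign: >0 if s1's element is greater.  */
    static int
    wmemcmp_diff_in_8_bytes (uint64_t chunk1, uint64_t chunk2)
    {
      /* Low dword = earlier element on little-endian x86-64.  */
      int32_t lo1 = (int32_t) chunk1, lo2 = (int32_t) chunk2;
      if (lo1 != lo2)
        return lo1 > lo2 ? 1 : -1;        /* signed, unlike memcmp */

      /* Otherwise the difference is in the later element.  */
      int32_t hi1 = (int32_t) (chunk1 >> 32), hi2 = (int32_t) (chunk2 >> 32);
      return hi1 > hi2 ? 1 : (hi1 < hi2 ? -1 : 0);
    }
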
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..b3a2ca1edd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1997 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ atom_text_section
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+ test %rdx, %rdx
+ jz L(equal)
+# endif
+ mov %rdx, %rcx
+ mov %rdi, %rdx
+ cmp $48, %rcx;
+ jae L(48bytesormore) /* LEN >= 48 */
+
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+/* RCX >= 48. */
+L(48bytesormore):
+ movdqu (%rdi), %xmm3
+ movdqu (%rsi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%rdi), %rdi
+ lea 16(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %rdx, %rdi
+ sub %rdx, %rsi
+ add %rdx, %rcx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %rdx, %rsi
+
+# ifndef USE_AS_WMEMCMP
+ cmp $8, %edx
+ jae L(next_unaligned_table)
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $1, %edx
+ je L(shr_1)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $3, %edx
+ je L(shr_3)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $5, %edx
+ je L(shr_5)
+ cmp $6, %edx
+ je L(shr_6)
+ jmp L(shr_7)
+
+ ALIGN (2)
+L(next_unaligned_table):
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $9, %edx
+ je L(shr_9)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $11, %edx
+ je L(shr_11)
+ cmp $12, %edx
+ je L(shr_12)
+ cmp $13, %edx
+ je L(shr_13)
+ cmp $14, %edx
+ je L(shr_14)
+ jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
+
+ ALIGN (4)
+L(shr_0):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ jae L(shr_0_gobble)
+ xor %eax, %eax
+ movdqa (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+ movdqa 16(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_0_gobble):
+ movdqa (%rsi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%rdi), %xmm0
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm2
+L(shr_0_gobble_loop):
+ pand %xmm0, %xmm2
+ sub $32, %rcx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%rsi), %xmm0
+ movdqa 48(%rsi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%rdi), %xmm0
+ pcmpeqb 48(%rdi), %xmm2
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ jz L(shr_0_gobble_loop)
+
+ pand %xmm0, %xmm2
+ cmp $0, %rcx
+ jge L(next)
+ inc %edx
+ add $32, %rcx
+L(next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_1):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_1_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $1, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $1, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $1, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_1_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $1, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $1, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_1_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $1, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $1, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_1_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_1_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_1_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 1(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+
+ ALIGN (4)
+L(shr_2):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $2, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $2, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_2_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $2, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $2, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_2_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $2, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $2, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_2_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 2(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_3):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_3_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $3, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $3, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $3, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_3_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $3, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $3, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_3_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $3, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $3, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_3_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_3_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_3_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 3(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_4):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $4, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $4, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_4_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $4, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $4, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_4_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $4, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $4, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_4_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 4(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_5):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_5_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $5, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $5, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $5, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_5_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $5, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $5, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_5_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $5, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $5, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_5_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_5_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_5_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 5(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_6):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $6, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $6, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_6_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $6, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $6, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_6_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $6, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $6, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_6_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 6(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_7):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_7_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $7, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $7, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $7, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_7_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $7, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $7, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_7_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $7, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $7, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_7_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_7_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_7_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 7(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_8):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $8, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $8, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_8_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $8, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $8, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_8_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $8, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $8, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_8_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 8(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_9):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_9_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $9, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $9, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $9, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_9_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $9, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $9, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_9_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $9, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $9, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_9_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_9_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_9_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 9(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_10):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $10, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $10, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_10_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $10, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $10, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_10_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $10, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $10, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_10_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 10(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_11):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_11_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $11, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $11, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $11, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_11_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $11, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $11, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_11_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $11, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $11, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_11_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_11_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_11_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 11(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_12):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $12, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_12_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $12, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $12, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_12_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $12, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $12, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_12_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 12(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_13):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_13_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $13, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $13, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $13, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_13_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $13, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $13, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_13_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $13, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $13, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_13_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_13_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_13_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 13(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_14):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $14, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_14_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $14, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $14, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_14_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $14, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $14, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_14_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 14(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_15):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_15_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $15, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $15, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $15, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_15_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $15, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $15, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_15_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $15, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $15, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_15_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_15_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_15_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 15(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+# endif
+ ALIGN (4)
+L(exit):
+ pmovmskb %xmm1, %r8d
+ sub $0xffff, %r8d
+ jz L(first16bytes)
+ lea -16(%rsi), %rsi
+ lea -16(%rdi), %rdi
+ mov %r8d, %edx
+L(first16bytes):
+ add %rax, %rsi
+L(less16bytes):
+# ifndef USE_AS_WMEMCMP
+ test %dl, %dl
+ jz L(next_24_bytes)
+
+ test $0x01, %dl
+ jnz L(Byte16)
+
+ test $0x02, %dl
+ jnz L(Byte17)
+
+ test $0x04, %dl
+ jnz L(Byte18)
+
+ test $0x08, %dl
+ jnz L(Byte19)
+
+ test $0x10, %dl
+ jnz L(Byte20)
+
+ test $0x20, %dl
+ jnz L(Byte21)
+
+ test $0x40, %dl
+ jnz L(Byte22)
+
+ movzbl -9(%rdi), %eax
+ movzbl -9(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte16):
+ movzbl -16(%rdi), %eax
+ movzbl -16(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte17):
+ movzbl -15(%rdi), %eax
+ movzbl -15(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte18):
+ movzbl -14(%rdi), %eax
+ movzbl -14(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte19):
+ movzbl -13(%rdi), %eax
+ movzbl -13(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte20):
+ movzbl -12(%rdi), %eax
+ movzbl -12(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte21):
+ movzbl -11(%rdi), %eax
+ movzbl -11(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte22):
+ movzbl -10(%rdi), %eax
+ movzbl -10(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(next_24_bytes):
+ lea 8(%rdi), %rdi
+ lea 8(%rsi), %rsi
+ test $0x01, %dh
+ jnz L(Byte16)
+
+ test $0x02, %dh
+ jnz L(Byte17)
+
+ test $0x04, %dh
+ jnz L(Byte18)
+
+ test $0x08, %dh
+ jnz L(Byte19)
+
+ test $0x10, %dh
+ jnz L(Byte20)
+
+ test $0x20, %dh
+ jnz L(Byte21)
+
+ test $0x40, %dh
+ jnz L(Byte22)
+
+ mov -9(%rdi), %eax
+ and $0xff, %eax
+ mov -9(%rsi), %edx
+ and $0xff, %edx
+ sub %edx, %eax
+ ret
+# else
+/* special for wmemcmp */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%rdi), %eax
+ cmp -16(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(second_double_word):
+ mov -12(%rdi), %eax
+ cmp -12(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%rdi), %eax
+ cmp -8(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(fourth_double_word):
+ mov -4(%rdi), %eax
+ cmp -4(%rsi), %eax
+ jne L(find_diff)
+ ret
+# endif
+
+ ALIGN (4)
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+ cmp $0, %ecx
+ je L(0bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $1, %ecx
+ je L(1bytes)
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
+
+ ALIGN (4)
+L(more8bytes):
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $9, %ecx
+ je L(9bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $11, %ecx
+ je L(11bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ cmp $13, %ecx
+ je L(13bytes)
+ cmp $14, %ecx
+ je L(14bytes)
+ jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
+
+ ALIGN (4)
+L(more16bytes):
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $17, %ecx
+ je L(17bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $19, %ecx
+ je L(19bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ cmp $21, %ecx
+ je L(21bytes)
+ cmp $22, %ecx
+ je L(22bytes)
+ jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
+
+ ALIGN (4)
+L(more24bytes):
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $25, %ecx
+ je L(25bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $27, %ecx
+ je L(27bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ cmp $29, %ecx
+ je L(29bytes)
+ cmp $30, %ecx
+ je L(30bytes)
+ jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
+
+ ALIGN (4)
+L(more32bytes):
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $33, %ecx
+ je L(33bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $35, %ecx
+ je L(35bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ cmp $37, %ecx
+ je L(37bytes)
+ cmp $38, %ecx
+ je L(38bytes)
+ jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+ ALIGN (4)
+L(more40bytes):
+ cmp $40, %ecx
+ je L(40bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $41, %ecx
+ je L(41bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $43, %ecx
+ je L(43bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ cmp $45, %ecx
+ je L(45bytes)
+ cmp $46, %ecx
+ je L(46bytes)
+ jmp L(47bytes)
+
+ ALIGN (4)
+L(44bytes):
+ movl -44(%rdi), %eax
+ movl -44(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(40bytes):
+ movl -40(%rdi), %eax
+ movl -40(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(36bytes):
+ movl -36(%rdi), %eax
+ movl -36(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(32bytes):
+ movl -32(%rdi), %eax
+ movl -32(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(28bytes):
+ movl -28(%rdi), %eax
+ movl -28(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%rdi), %eax
+ movl -24(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%rdi), %eax
+ movl -20(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%rdi), %eax
+ movl -16(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%rdi), %eax
+ movl -12(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%rdi), %eax
+ movl -8(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%rdi), %eax
+ movl -4(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(0bytes):
+ xor %eax, %eax
+ ret
+# else
+ ALIGN (4)
+L(44bytes):
+ movl -44(%rdi), %eax
+ cmp -44(%rsi), %eax
+ jne L(find_diff)
+L(40bytes):
+ movl -40(%rdi), %eax
+ cmp -40(%rsi), %eax
+ jne L(find_diff)
+L(36bytes):
+ movl -36(%rdi), %eax
+ cmp -36(%rsi), %eax
+ jne L(find_diff)
+L(32bytes):
+ movl -32(%rdi), %eax
+ cmp -32(%rsi), %eax
+ jne L(find_diff)
+L(28bytes):
+ movl -28(%rdi), %eax
+ cmp -28(%rsi), %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%rdi), %eax
+ cmp -24(%rsi), %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%rdi), %eax
+ cmp -20(%rsi), %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%rdi), %eax
+ cmp -16(%rsi), %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%rdi), %eax
+ cmp -12(%rsi), %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%rdi), %eax
+ cmp -8(%rsi), %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%rdi), %eax
+ cmp -4(%rsi), %eax
+ jne L(find_diff)
+L(0bytes):
+ xor %eax, %eax
+ ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ ALIGN (4)
+L(45bytes):
+ movl -45(%rdi), %eax
+ movl -45(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(41bytes):
+ movl -41(%rdi), %eax
+ movl -41(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(37bytes):
+ movl -37(%rdi), %eax
+ movl -37(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(33bytes):
+ movl -33(%rdi), %eax
+ movl -33(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(29bytes):
+ movl -29(%rdi), %eax
+ movl -29(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(25bytes):
+ movl -25(%rdi), %eax
+ movl -25(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(21bytes):
+ movl -21(%rdi), %eax
+ movl -21(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(17bytes):
+ movl -17(%rdi), %eax
+ movl -17(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(13bytes):
+ movl -13(%rdi), %eax
+ movl -13(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(9bytes):
+ movl -9(%rdi), %eax
+ movl -9(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(5bytes):
+ movl -5(%rdi), %eax
+ movl -5(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(1bytes):
+ movzbl -1(%rdi), %eax
+ cmpb -1(%rsi), %al
+ jne L(set)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(46bytes):
+ movl -46(%rdi), %eax
+ movl -46(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(42bytes):
+ movl -42(%rdi), %eax
+ movl -42(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(38bytes):
+ movl -38(%rdi), %eax
+ movl -38(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(34bytes):
+ movl -34(%rdi), %eax
+ movl -34(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(30bytes):
+ movl -30(%rdi), %eax
+ movl -30(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(26bytes):
+ movl -26(%rdi), %eax
+ movl -26(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(22bytes):
+ movl -22(%rdi), %eax
+ movl -22(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(18bytes):
+ movl -18(%rdi), %eax
+ movl -18(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(14bytes):
+ movl -14(%rdi), %eax
+ movl -14(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(10bytes):
+ movl -10(%rdi), %eax
+ movl -10(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(6bytes):
+ movl -6(%rdi), %eax
+ movl -6(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmp %ecx, %eax
+ jne L(set)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(47bytes):
+ movl -47(%rdi), %eax
+ movl -47(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(43bytes):
+ movl -43(%rdi), %eax
+ movl -43(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(39bytes):
+ movl -39(%rdi), %eax
+ movl -39(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(35bytes):
+ movl -35(%rdi), %eax
+ movl -35(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(31bytes):
+ movl -31(%rdi), %eax
+ movl -31(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(27bytes):
+ movl -27(%rdi), %eax
+ movl -27(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(23bytes):
+ movl -23(%rdi), %eax
+ movl -23(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(19bytes):
+ movl -19(%rdi), %eax
+ movl -19(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%rdi), %eax
+ movl -15(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%rdi), %eax
+ movl -11(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%rdi), %eax
+ movl -7(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%rdi), %eax
+ movzwl -3(%rsi), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmp %ecx, %eax
+ jne L(set)
+ movzbl -1(%rdi), %eax
+ cmpb -1(%rsi), %al
+ jne L(set)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(find_diff):
+ cmpb %cl, %al
+ jne L(set)
+ cmpw %cx, %ax
+ jne L(set)
+ shr $16, %eax
+ shr $16, %ecx
+ cmpb %cl, %al
+ jne L(set)
+
+/* We get here only if we already know there is a
+   difference.  */
+
+ cmp %ecx, %eax
+L(set):
+ sbb %eax, %eax
+ sbb $-1, %eax
+ ret
+# else
+
+/* for wmemcmp */
+ ALIGN (4)
+L(find_diff):
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ ALIGN (4)
+L(find_diff_bigger):
+ ret
+# endif
+
+ ALIGN (4)
+L(equal):
+ xor %eax, %eax
+ ret
+
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index 301ab287f5..8bf8f3a417 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -1,5 +1,5 @@
/* Multiple versions of memcmp
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -29,11 +29,20 @@ ENTRY(memcmp)
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
-1: leaq __memcmp_sse2(%rip), %rax
- testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
- jz 2f
+
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 2f
+ leaq __memcmp_sse2(%rip), %rax
+ ret
+
+2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 3f
leaq __memcmp_sse4_1(%rip), %rax
-2: ret
+ ret
+
+3: leaq __memcmp_ssse3(%rip), %rax
+ ret
+
END(memcmp)
# undef ENTRY
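The resolver above always prefers the widest implementation the CPU supports;
the wmemcmp resolver added further below follows the same pattern.  As a
rough, hypothetical C sketch of the selection order (only the feature names
match the bits tested above; the helper functions are illustrative
stand-ins):

	#include <stdio.h>

	/* Stand-ins for the assembly implementations chosen by the resolver.  */
	static const char *impl_sse2 (void)   { return "__memcmp_sse2"; }
	static const char *impl_ssse3 (void)  { return "__memcmp_ssse3"; }
	static const char *impl_sse4_1 (void) { return "__memcmp_sse4_1"; }

	/* Same decision order as the assembly: without SSSE3 fall back to
	   SSE2; with SSE4.1 (which implies SSSE3) use the SSE4.1 version;
	   otherwise use the SSSE3 version.  */
	static const char *
	pick_memcmp (int has_ssse3, int has_sse4_1)
	{
	  if (!has_ssse3)
	    return impl_sse2 ();
	  if (has_sse4_1)
	    return impl_sse4_1 ();
	  return impl_ssse3 ();
	}

	int
	main (void)
	{
	  printf ("%s\n", pick_memcmp (1, 1));	/* __memcmp_sse4_1 */
	  printf ("%s\n", pick_memcmp (1, 0));	/* __memcmp_ssse3 */
	  printf ("%s\n", pick_memcmp (0, 0));	/* __memcmp_sse2 */
	  return 0;
	}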
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..793f059aff
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP __wmemcmp_sse2
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..b07973a4f6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_1
+
+#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..7c3b7ed178
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -0,0 +1,47 @@
+/* Multiple versions of wmemcmp
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(wmemcmp)
+ .type wmemcmp, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 2f
+ leaq __wmemcmp_sse2(%rip), %rax
+ ret
+
+2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 3f
+ leaq __wmemcmp_sse4_1(%rip), %rax
+ ret
+
+3: leaq __wmemcmp_ssse3(%rip), %rax
+ ret
+
+END(wmemcmp)
+#endif
diff --git a/wcsmbs/wmemcmp.c b/wcsmbs/wmemcmp.c
index c6a321b89d..e7edc87164 100644
--- a/wcsmbs/wmemcmp.c
+++ b/wcsmbs/wmemcmp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996, 1997 Free Software Foundation, Inc.
+/* Copyright (C) 1996, 1997, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
@@ -19,9 +19,12 @@
#include <wchar.h>
+#ifndef WMEMCMP
+# define WMEMCMP wmemcmp
+#endif
int
-wmemcmp (s1, s2, n)
+WMEMCMP (s1, s2, n)
const wchar_t *s1;
const wchar_t *s2;
size_t n;
@@ -34,19 +37,19 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[1];
c2 = (wint_t) s2[1];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[2];
c2 = (wint_t) s2[2];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[3];
c2 = (wint_t) s2[3];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
s1 += 4;
s2 += 4;
n -= 4;
@@ -57,7 +60,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
++s1;
++s2;
--n;
@@ -67,7 +70,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
++s1;
++s2;
--n;
@@ -77,7 +80,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
}
return 0;
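The change from returning "c1 - c2" to returning +/-1 is the overflow fix
mentioned in the ChangeLog: the wint_t subtraction can wrap and yield a
result with the wrong sign when the two elements are far apart.  A minimal
demonstration, assuming a 32-bit signed wchar_t as on x86-64 GNU/Linux:

	#include <limits.h>
	#include <stdio.h>
	#include <wchar.h>

	int
	main (void)
	{
	  /* s1[0] is smaller than s2[0] as a signed wchar_t, but the old
	     "c1 - c2" computation wraps around and comes out positive.  */
	  wchar_t s1[1] = { INT_MIN };
	  wchar_t s2[1] = { 1 };

	  wint_t c1 = (wint_t) s1[0];
	  wint_t c2 = (wint_t) s2[0];
	  printf ("old c1 - c2 result: %d\n", (int) (c1 - c2));	/* positive: wrong sign */
	  printf ("wmemcmp result:     %d\n", wmemcmp (s1, s2, 1));	/* negative with a correct implementation */
	  return 0;
	}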