author     Liubov Dmitrieva <liubov.dmitrieva@gmail.com>  2011-10-15 11:10:08 -0400
committer  Ulrich Drepper <drepper@gmail.com>             2011-10-15 11:10:08 -0400
commit     be13f7bff66e1850f9057dd813d6e7be022d9516 (patch)
tree       d918a146db9072ad120f0010481c53d9b450c9a5 /sysdeps/i386/i686/multiarch/memcmp-ssse3.S
parent     556a2007974ed39a68c87a8b5181f8057ecd0d6f (diff)
Optimized memcmp and wmemcmp for x86-64 and x86-32
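
For context, the semantics these optimized routines must preserve fit in a few
lines of C.  This is an illustrative reference sketch only (ref_memcmp and
ref_wmemcmp are hypothetical names, not part of the commit): memcmp orders
individual unsigned bytes, while wmemcmp orders whole wchar_t elements, which
are signed 32-bit ints on the targets in question.

#include <stddef.h>
#include <wchar.h>

/* Byte-wise comparison; bytes are compared as UNSIGNED values.  */
int ref_memcmp (const void *s1, const void *s2, size_t n)
{
  const unsigned char *a = s1, *b = s2;
  for (size_t i = 0; i < n; i++)
    if (a[i] != b[i])
      return a[i] < b[i] ? -1 : 1;
  return 0;
}

/* Element-wise comparison; wchar_t elements are compared as SIGNED.  */
int ref_wmemcmp (const wchar_t *a, const wchar_t *b, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (a[i] != b[i])
      return a[i] < b[i] ? -1 : 1;
  return 0;
}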
Diffstat (limited to 'sysdeps/i386/i686/multiarch/memcmp-ssse3.S')
-rw-r--r--  sysdeps/i386/i686/multiarch/memcmp-ssse3.S  565
1 file changed, 371 insertions, 194 deletions
diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
index 2e0d15fe55..eab85c1de1 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
-/* memcmp with SSSE3
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,47 +20,64 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1+4
+# define LEN BLK2+4
+# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
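
The warning just added is the crux of sharing one source file between memcmp
and wmemcmp.  A standalone demonstration of why the distinction matters
(illustrative only; assumes a 32-bit wchar_t and a little-endian target, as
on i386):

#include <stdio.h>
#include <string.h>
#include <wchar.h>

int main (void)
{
  wchar_t a[1] = { (wchar_t) -1 };   /* bytes: ff ff ff ff (little endian) */
  wchar_t b[1] = { 1 };              /* bytes: 01 00 00 00 */

  /* wmemcmp compares elements as signed ints: -1 < 1, so result < 0.  */
  printf ("wmemcmp < 0: %d\n", wmemcmp (a, b, 1) < 0);

  /* memcmp compares bytes as unsigned: 0xff > 0x01, so result > 0.  */
  printf ("memcmp  > 0: %d\n", memcmp (a, b, sizeof a) > 0);
  return 0;
}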
- .section .text.ssse3,"ax",@progbits
+ atom_text_section
ENTRY (MEMCMP)
movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(zero)
+# endif
+
movl BLK1(%esp), %eax
cmp $48, %ecx
movl BLK2(%esp), %edx
jae L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
cmp $1, %ecx
jbe L(less1bytes)
- PUSH (%ebx)
+# endif
+
+ PUSH (%ebx)
add %ecx, %edx
add %ecx, %eax
jmp L(less48bytes)
- ALIGN (4)
- CFI_POP (%ebx)
+ CFI_POP (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less1bytes):
jb L(zero)
movb (%eax), %cl
@@ -71,29 +88,30 @@ L(less1bytes):
neg %eax
L(1bytesend):
ret
+# endif
- ALIGN (4)
+ .p2align 4
L(zero):
- mov $0, %eax
+ xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
cfi_remember_state
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
movl %eax, %edi
movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
lea 16(%edi), %edi
- sub $0xffff, %edx
+ sub $0xffff, %edx
lea 16(%esi), %esi
- jnz L(less16bytes)
+ jnz L(less16bytes)
mov %edi, %edx
and $0xf, %edx
xor %edx, %edi
@@ -104,6 +122,7 @@ L(48bytesormore):
jz L(shr_0)
xor %edx, %esi
+# ifndef USE_AS_WMEMCMP
cmp $8, %edx
jae L(next_unaligned_table)
cmp $0, %edx
@@ -122,7 +141,7 @@ L(48bytesormore):
je L(shr_6)
jmp L(shr_7)
- ALIGN (4)
+ .p2align 2
L(next_unaligned_table):
cmp $8, %edx
je L(shr_8)
@@ -139,8 +158,17 @@ L(next_unaligned_table):
cmp $14, %edx
je L(shr_14)
jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
- ALIGN (4)
+ .p2align 4
L(shr_0):
cmp $80, %ecx
jae L(shr_0_gobble)
@@ -159,13 +187,13 @@ L(shr_0):
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_0_gobble):
lea -48(%ecx), %ecx
movdqa (%esi), %xmm0
@@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -235,13 +264,13 @@ L(shr_1):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -288,14 +317,14 @@ L(shr_1_gobble_next):
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -319,13 +348,13 @@ L(shr_2):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -372,13 +401,13 @@ L(shr_2_gobble_next):
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -402,13 +431,13 @@ L(shr_3):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -455,13 +484,14 @@ L(shr_3_gobble_next):
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -485,13 +515,13 @@ L(shr_4):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -538,13 +568,14 @@ L(shr_4_gobble_next):
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -568,13 +599,13 @@ L(shr_5):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -621,13 +652,13 @@ L(shr_5_gobble_next):
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -651,13 +682,13 @@ L(shr_6):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -704,13 +735,13 @@ L(shr_6_gobble_next):
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -734,13 +765,13 @@ L(shr_7):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -787,13 +818,14 @@ L(shr_7_gobble_next):
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -817,13 +849,13 @@ L(shr_8):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -870,13 +902,14 @@ L(shr_8_gobble_next):
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -900,13 +933,13 @@ L(shr_9):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -953,13 +986,13 @@ L(shr_9_gobble_next):
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -983,13 +1016,13 @@ L(shr_10):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1036,13 +1069,13 @@ L(shr_10_gobble_next):
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1066,13 +1099,13 @@ L(shr_11):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1119,13 +1152,14 @@ L(shr_11_gobble_next):
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1149,13 +1183,13 @@ L(shr_12):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1202,13 +1236,14 @@ L(shr_12_gobble_next):
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1232,13 +1267,13 @@ L(shr_13):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1285,13 +1320,13 @@ L(shr_13_gobble_next):
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1315,13 +1350,13 @@ L(shr_14):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1368,13 +1403,13 @@ L(shr_14_gobble_next):
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1398,13 +1433,13 @@ L(shr_15):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1451,13 +1486,14 @@ L(shr_15_gobble_next):
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(exit):
pmovmskb %xmm1, %ebx
sub $0xffff, %ebx
@@ -1465,9 +1501,12 @@ L(exit):
lea -16(%esi), %esi
lea -16(%edi), %edi
mov %ebx, %edx
+
L(first16bytes):
add %eax, %esi
L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
test %dl, %dl
jz L(next_24_bytes)
@@ -1492,61 +1531,61 @@ L(less16bytes):
test $0x40, %dl
jnz L(Byte22)
L(Byte23):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte16):
- movzbl -16(%edi), %eax
- movzbl -16(%esi), %edx
+ movzbl -16(%edi), %eax
+ movzbl -16(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte17):
- movzbl -15(%edi), %eax
- movzbl -15(%esi), %edx
+ movzbl -15(%edi), %eax
+ movzbl -15(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte18):
- movzbl -14(%edi), %eax
- movzbl -14(%esi), %edx
+ movzbl -14(%edi), %eax
+ movzbl -14(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte19):
- movzbl -13(%edi), %eax
- movzbl -13(%esi), %edx
+ movzbl -13(%edi), %eax
+ movzbl -13(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte20):
- movzbl -12(%edi), %eax
- movzbl -12(%esi), %edx
+ movzbl -12(%edi), %eax
+ movzbl -12(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte21):
- movzbl -11(%edi), %eax
- movzbl -11(%esi), %edx
+ movzbl -11(%edi), %eax
+ movzbl -11(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte22):
- movzbl -10(%edi), %eax
- movzbl -10(%esi), %edx
+ movzbl -10(%edi), %eax
+ movzbl -10(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(next_24_bytes):
lea 8(%edi), %edi
lea 8(%esi), %esi
@@ -1571,20 +1610,69 @@ L(next_24_bytes):
test $0x40, %dh
jnz L(Byte22)
- ALIGN (4)
+ .p2align 4
L(Byte31):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN_END
+# else
+
+/* special for wmemcmp */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%edi), %eax
+ cmp -16(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov -12(%edi), %eax
+ cmp -12(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%edi), %eax
+ cmp -8(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov -4(%edi), %eax
+ cmp -4(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(nequal_bigger):
+ RETURN_END
+# endif
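
A sketch of what the wmemcmp exit path above computes (illustrative;
exit_dword is a hypothetical helper, not in the commit): %edx holds the
inverted pmovmskb result, so a set bit marks a differing byte.  Bits 0-7 sit
in %dl and bits 8-15 in %dh, each nibble covering one aligned 32-bit element;
the first differing element is then compared once as a signed value.

#include <stdint.h>

/* Given a 16-bit byte-mismatch mask (bit i set => byte i differs),
   locate the first differing 32-bit element and compare it as SIGNED,
   mirroring the branch ladder in the assembly above.  */
static int exit_dword (const int32_t *p1, const int32_t *p2, unsigned mask)
{
  int idx = (mask & 0x000f) ? 0
          : (mask & 0x00f0) ? 1
          : (mask & 0x0f00) ? 2 : 3;
  if (p1[idx] == p2[idx])
    return 0;
  return p1[idx] < p2[idx] ? -1 : 1;
}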
CFI_PUSH (%ebx)
- ALIGN (4)
+
+ .p2align 4
L(more8bytes):
cmp $16, %ecx
jae L(more16bytes)
cmp $8, %ecx
je L(8bytes)
+# ifndef USE_AS_WMEMCMP
cmp $9, %ecx
je L(9bytes)
cmp $10, %ecx
@@ -1598,13 +1686,17 @@ L(more8bytes):
cmp $14, %ecx
je L(14bytes)
jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more16bytes):
cmp $24, %ecx
jae L(more24bytes)
cmp $16, %ecx
je L(16bytes)
+# ifndef USE_AS_WMEMCMP
cmp $17, %ecx
je L(17bytes)
cmp $18, %ecx
@@ -1618,13 +1710,17 @@ L(more16bytes):
cmp $22, %ecx
je L(22bytes)
jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more24bytes):
cmp $32, %ecx
jae L(more32bytes)
cmp $24, %ecx
je L(24bytes)
+# ifndef USE_AS_WMEMCMP
cmp $25, %ecx
je L(25bytes)
cmp $26, %ecx
@@ -1638,13 +1734,17 @@ L(more24bytes):
cmp $30, %ecx
je L(30bytes)
jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more32bytes):
cmp $40, %ecx
jae L(more40bytes)
cmp $32, %ecx
je L(32bytes)
+# ifndef USE_AS_WMEMCMP
cmp $33, %ecx
je L(33bytes)
cmp $34, %ecx
@@ -1658,11 +1758,35 @@ L(more32bytes):
cmp $38, %ecx
je L(38bytes)
jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+ .p2align 4
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more40bytes):
cmp $40, %ecx
je L(40bytes)
+# ifndef USE_AS_WMEMCMP
cmp $41, %ecx
je L(41bytes)
cmp $42, %ecx
@@ -1677,23 +1801,7 @@ L(more40bytes):
je L(46bytes)
jmp L(47bytes)
- ALIGN (4)
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-
- ALIGN (4)
+ .p2align 4
L(44bytes):
mov -44(%eax), %ecx
mov -44(%edx), %ebx
@@ -1750,11 +1858,64 @@ L(4bytes):
cmp %ebx, %ecx
mov $0, %eax
jne L(find_diff)
- POP (%ebx)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+# else
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ cmp -44(%edx), %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ cmp -40(%edx), %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ cmp -36(%edx), %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ cmp -32(%edx), %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ cmp -28(%edx), %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ cmp -24(%edx), %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ cmp -20(%edx), %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ xor %eax, %eax
+ cmp -4(%edx), %ecx
+ jne L(find_diff)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
+# endif
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+
+ .p2align 4
L(45bytes):
mov -45(%eax), %ecx
mov -45(%edx), %ebx
@@ -1814,11 +1975,11 @@ L(5bytes):
cmp -1(%edx), %cl
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(46bytes):
mov -46(%eax), %ecx
mov -46(%edx), %ebx
@@ -1882,11 +2043,11 @@ L(2bytes):
cmp %bh, %ch
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(47bytes):
movl -47(%eax), %ecx
movl -47(%edx), %ebx
@@ -1953,11 +2114,11 @@ L(3bytes):
cmpb -1(%edx), %al
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(find_diff):
cmpb %bl, %cl
jne L(end)
@@ -1968,14 +2129,30 @@ L(find_diff):
cmp %bl, %cl
jne L(end)
cmp %bx, %cx
+
+ .p2align 4
L(end):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(bigger)
neg %eax
L(bigger):
ret
+# else
-END (MEMCMP)
+/* for wmemcmp */
+ .p2align 4
+L(find_diff):
+ POP (%ebx)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+# endif
+END (MEMCMP)
#endif