| author | Liubov Dmitrieva <liubov.dmitrieva@gmail.com> | 2011-10-15 11:10:08 -0400 |
|---|---|---|
| committer | Ulrich Drepper <drepper@gmail.com> | 2011-10-15 11:10:08 -0400 |
| commit | be13f7bff66e1850f9057dd813d6e7be022d9516 | |
| tree | d918a146db9072ad120f0010481c53d9b450c9a5 /sysdeps/i386/i686/multiarch/memcmp-ssse3.S | |
| parent | 556a2007974ed39a68c87a8b5181f8057ecd0d6f | |
Optimized memcmp and wmemcmp for x86-64 and x86-32
Diffstat (limited to 'sysdeps/i386/i686/multiarch/memcmp-ssse3.S')
-rw-r--r-- | sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 565 |
1 file changed, 371 insertions(+), 194 deletions(-)
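This commit turns the single assembly source into a template that is assembled twice: by default it builds `__memcmp_ssse3`, and with `USE_AS_WMEMCMP` defined the same code compares 4-byte elements with signed semantics where needed. The companion file that flips the switch is not part of this diff; based on the `MEMCMP` and `USE_AS_WMEMCMP` macros visible in the patch below, a minimal sketch of such a wrapper (the file name `wmemcmp-ssse3.S` and the symbol `__wmemcmp_ssse3` are assumptions, not shown in this commit) would be:

```c
/* Hypothetical wmemcmp-ssse3.S companion file (a sketch, not part of
   this commit): reuse memcmp-ssse3.S as a template.  MEMCMP renames
   the entry point; USE_AS_WMEMCMP selects the wide-character
   (signed, 4-byte element) code paths.  */
#define USE_AS_WMEMCMP
#define MEMCMP __wmemcmp_ssse3
#include "memcmp-ssse3.S"
```

Inside the template, note the `shl $2, %ecx` added at the top of `ENTRY (MEMCMP)`: wmemcmp's length argument counts wide characters, so it is multiplied by 4 (sizeof (wchar_t) on x86) to get a byte count, after which most of the byte-oriented code can be shared.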
diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
index 2e0d15fe55..eab85c1de1 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
-/* memcmp with SSSE3
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -20,47 +20,64 @@
 #ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
 
-#ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-#endif
+# ifndef MEMCMP
+#  define MEMCMP __memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
 
-#define CFI_PUSH(REG) \
-	cfi_adjust_cfa_offset (4); \
-	cfi_rel_offset (REG, 0)
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
 
-#define CFI_POP(REG) \
-	cfi_adjust_cfa_offset (-4); \
-	cfi_restore (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
 
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1+4
+# define LEN BLK2+4
+# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
 
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
+/* Warning!
+	wmemcmp has to use SIGNED comparison for elements.
+	memcmp has to use UNSIGNED comparison for elemnts.
+*/
 
-	.section .text.ssse3,"ax",@progbits
+	atom_text_section
 ENTRY (MEMCMP)
 	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(zero)
+# endif
+
 	movl	BLK1(%esp), %eax
 	cmp	$48, %ecx
 	movl	BLK2(%esp), %edx
 	jae	L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
 	cmp	$1, %ecx
 	jbe	L(less1bytes)
-	PUSH (%ebx)
+# endif
+
+	PUSH (%ebx)
 	add	%ecx, %edx
 	add	%ecx, %eax
 	jmp	L(less48bytes)
 
-	ALIGN (4)
-	CFI_POP (%ebx)
+	CFI_POP (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less1bytes):
@@ -71,29 +88,30 @@ L(less1bytes):
 	jb	L(zero)
 	movb	(%eax), %cl
 	cmp	(%edx), %cl
 	je	L(zero)
 	mov	$1, %eax
 	ja	L(1bytesend)
 	neg	%eax
 L(1bytesend):
 	ret
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(zero):
-	mov	$0, %eax
+	xor	%eax, %eax
 	ret
 
-	ALIGN (4)
+	.p2align 4
 L(48bytesormore):
-	PUSH (%ebx)
-	PUSH (%esi)
-	PUSH (%edi)
+	PUSH (%ebx)
+	PUSH (%esi)
+	PUSH (%edi)
 	cfi_remember_state
-	movdqu (%eax), %xmm3
-	movdqu (%edx), %xmm0
+	movdqu (%eax), %xmm3
+	movdqu (%edx), %xmm0
 	movl	%eax, %edi
 	movl	%edx, %esi
-	pcmpeqb %xmm0, %xmm3
-	pmovmskb %xmm3, %edx
+	pcmpeqb %xmm0, %xmm3
+	pmovmskb %xmm3, %edx
 	lea	16(%edi), %edi
-	sub	$0xffff, %edx
+	sub	$0xffff, %edx
 	lea	16(%esi), %esi
-	jnz	L(less16bytes)
+	jnz	L(less16bytes)
 	mov	%edi, %edx
 	and	$0xf, %edx
 	xor	%edx, %edi
@@ -104,6 +122,7 @@ L(48bytesormore):
 	jz	L(shr_0)
 	xor	%edx, %esi
+# ifndef USE_AS_WMEMCMP
 	cmp	$8, %edx
 	jae	L(next_unaligned_table)
 	cmp	$0, %edx
 	je	L(shr_0)
@@ -122,7 +141,7 @@ L(48bytesormore):
 	cmp	$1, %edx
 	je	L(shr_1)
 	cmp	$2, %edx
 	je	L(shr_2)
 	cmp	$3, %edx
 	je	L(shr_3)
 	cmp	$4, %edx
 	je	L(shr_4)
 	cmp	$5, %edx
 	je	L(shr_5)
 	cmp	$6, %edx
 	je	L(shr_6)
 	jmp	L(shr_7)
 
-	ALIGN (4)
+	.p2align 2
 L(next_unaligned_table):
 	cmp	$8, %edx
 	je	L(shr_8)
 	cmp	$9, %edx
 	je	L(shr_9)
 	cmp	$10, %edx
 	je	L(shr_10)
 	cmp	$11, %edx
 	je	L(shr_11)
 	cmp	$12, %edx
 	je	L(shr_12)
 	cmp	$13, %edx
 	je	L(shr_13)
@@ -139,8 +158,17 @@ L(next_unaligned_table):
 	cmp	$14, %edx
 	je	L(shr_14)
 	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(shr_0):
 	cmp	$80, %ecx
 	jae	L(shr_0_gobble)
@@ -159,13 +187,13 @@ L(shr_0):
 	lea	(%ecx, %edi,1), %eax
 	lea	(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_0_gobble):
 	lea	-48(%ecx), %ecx
 	movdqa	(%esi), %xmm0
@@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
+# ifndef USE_AS_WMEMCMP
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_1):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -235,13 +264,13 @@ L(shr_1):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	1(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_1_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -288,14 +317,14 @@ L(shr_1_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	1(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
 
-	ALIGN (4)
+	.p2align 4
 L(shr_2):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -319,13 +348,13 @@ L(shr_2):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	2(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_2_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -372,13 +401,13 @@ L(shr_2_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	2(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_3):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -402,13 +431,13 @@ L(shr_3):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	3(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_3_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -455,13 +484,14 @@ L(shr_3_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	3(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_4):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -485,13 +515,13 @@ L(shr_4):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	4(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_4_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -538,13 +568,14 @@ L(shr_4_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	4(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
+# ifndef USE_AS_WMEMCMP
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_5):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -568,13 +599,13 @@ L(shr_5):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	5(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_5_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -621,13 +652,13 @@ L(shr_5_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	5(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_6):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -651,13 +682,13 @@ L(shr_6):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	6(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_6_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -704,13 +735,13 @@ L(shr_6_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	6(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_7):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -734,13 +765,13 @@ L(shr_7):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	7(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_7_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -787,13 +818,14 @@ L(shr_7_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	7(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_8):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -817,13 +849,13 @@ L(shr_8):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	8(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_8_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -870,13 +902,14 @@ L(shr_8_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	8(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
+# ifndef USE_AS_WMEMCMP
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_9):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -900,13 +933,13 @@ L(shr_9):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	9(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_9_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -953,13 +986,13 @@ L(shr_9_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	9(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_10):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -983,13 +1016,13 @@ L(shr_10):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	10(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_10_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1036,13 +1069,13 @@ L(shr_10_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	10(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_11):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1066,13 +1099,13 @@ L(shr_11):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	11(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_11_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1119,13 +1152,14 @@ L(shr_11_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	11(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_12):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1149,13 +1183,13 @@ L(shr_12):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	12(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_12_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1202,13 +1236,14 @@ L(shr_12_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	12(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
+# ifndef USE_AS_WMEMCMP
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_13):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1232,13 +1267,13 @@ L(shr_13):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	13(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_13_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1285,13 +1320,13 @@ L(shr_13_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	13(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_14):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1315,13 +1350,13 @@ L(shr_14):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	14(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_14_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1368,13 +1403,13 @@ L(shr_14_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	14(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_15):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1398,13 +1433,13 @@ L(shr_15):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	15(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_15_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1451,13 +1486,14 @@ L(shr_15_gobble_next):
 	lea	(%ecx, %edi,1), %eax
 	lea	15(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP (%edi)
+	POP (%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(exit):
 	pmovmskb %xmm1, %ebx
 	sub	$0xffff, %ebx
@@ -1465,9 +1501,12 @@ L(exit):
 	lea	-16(%esi), %esi
 	lea	-16(%edi), %edi
 	mov	%ebx, %edx
+
 L(first16bytes):
 	add	%eax, %esi
 L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
 	test	%dl, %dl
 	jz	L(next_24_bytes)
@@ -1492,61 +1531,61 @@ L(less16bytes):
 	test	$0x40, %dl
 	jnz	L(Byte22)
 L(Byte23):
-	movzbl	-9(%edi), %eax
-	movzbl	-9(%esi), %edx
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte16):
-	movzbl	-16(%edi), %eax
-	movzbl	-16(%esi), %edx
+	movzbl	-16(%edi), %eax
+	movzbl	-16(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte17):
-	movzbl	-15(%edi), %eax
-	movzbl	-15(%esi), %edx
+	movzbl	-15(%edi), %eax
+	movzbl	-15(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte18):
-	movzbl	-14(%edi), %eax
-	movzbl	-14(%esi), %edx
+	movzbl	-14(%edi), %eax
+	movzbl	-14(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte19):
-	movzbl	-13(%edi), %eax
-	movzbl	-13(%esi), %edx
+	movzbl	-13(%edi), %eax
+	movzbl	-13(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte20):
-	movzbl	-12(%edi), %eax
-	movzbl	-12(%esi), %edx
+	movzbl	-12(%edi), %eax
+	movzbl	-12(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte21):
-	movzbl	-11(%edi), %eax
-	movzbl	-11(%esi), %edx
+	movzbl	-11(%edi), %eax
+	movzbl	-11(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte22):
-	movzbl	-10(%edi), %eax
-	movzbl	-10(%esi), %edx
+	movzbl	-10(%edi), %eax
+	movzbl	-10(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(next_24_bytes):
 	lea	8(%edi), %edi
 	lea	8(%esi), %esi
@@ -1571,20 +1610,69 @@ L(next_24_bytes):
 	test	$0x40, %dh
 	jnz	L(Byte22)
 
-	ALIGN (4)
+	.p2align 4
 L(Byte31):
-	movzbl	-9(%edi), %eax
-	movzbl	-9(%esi), %edx
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN_END
+# else
+
+/* special for wmemcmp */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	-16(%edi), %eax
+	cmp	-16(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	-12(%edi), %eax
+	cmp	-12(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%edi), %eax
+	cmp	-8(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	-4(%edi), %eax
+	cmp	-4(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(nequal_bigger):
+	RETURN_END
+# endif
 
 	CFI_PUSH (%ebx)
-	ALIGN (4)
+
+	.p2align 4
 L(more8bytes):
 	cmp	$16, %ecx
 	jae	L(more16bytes)
 	cmp	$8, %ecx
 	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$9, %ecx
 	je	L(9bytes)
 	cmp	$10, %ecx
@@ -1598,13 +1686,17 @@ L(more8bytes):
 	cmp	$14, %ecx
 	je	L(14bytes)
 	jmp	L(15bytes)
+# else
+	jmp	L(12bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more16bytes):
 	cmp	$24, %ecx
 	jae	L(more24bytes)
 	cmp	$16, %ecx
 	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$17, %ecx
 	je	L(17bytes)
 	cmp	$18, %ecx
@@ -1618,13 +1710,17 @@ L(more16bytes):
 	cmp	$22, %ecx
 	je	L(22bytes)
 	jmp	L(23bytes)
+# else
+	jmp	L(20bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more24bytes):
 	cmp	$32, %ecx
 	jae	L(more32bytes)
 	cmp	$24, %ecx
 	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$25, %ecx
 	je	L(25bytes)
 	cmp	$26, %ecx
@@ -1638,13 +1734,17 @@ L(more24bytes):
 	cmp	$30, %ecx
 	je	L(30bytes)
 	jmp	L(31bytes)
+# else
+	jmp	L(28bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more32bytes):
 	cmp	$40, %ecx
 	jae	L(more40bytes)
 	cmp	$32, %ecx
 	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$33, %ecx
 	je	L(33bytes)
 	cmp	$34, %ecx
@@ -1658,11 +1758,35 @@ L(more32bytes):
 	cmp	$38, %ecx
 	je	L(38bytes)
 	jmp	L(39bytes)
+# else
+	jmp	L(36bytes)
+# endif
+
+	.p2align 4
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+# else
+	jmp	L(4bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more40bytes):
 	cmp	$40, %ecx
 	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$41, %ecx
 	je	L(41bytes)
 	cmp	$42, %ecx
@@ -1677,23 +1801,7 @@ L(more40bytes):
 	je	L(46bytes)
 	jmp	L(47bytes)
 
-	ALIGN (4)
-L(less48bytes):
-	cmp	$8, %ecx
-	jae	L(more8bytes)
-	cmp	$2, %ecx
-	je	L(2bytes)
-	cmp	$3, %ecx
-	je	L(3bytes)
-	cmp	$4, %ecx
-	je	L(4bytes)
-	cmp	$5, %ecx
-	je	L(5bytes)
-	cmp	$6, %ecx
-	je	L(6bytes)
-	jmp	L(7bytes)
-
-	ALIGN (4)
+	.p2align 4
 L(44bytes):
 	mov	-44(%eax), %ecx
 	mov	-44(%edx), %ebx
@@ -1750,11 +1858,64 @@ L(4bytes):
 	cmp	%ebx, %ecx
 	mov	$0, %eax
 	jne	L(find_diff)
-	POP (%ebx)
+	POP (%ebx)
 	ret
 	CFI_PUSH (%ebx)
+# else
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	cmp	-44(%edx), %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	cmp	-40(%edx), %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	cmp	-36(%edx), %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	cmp	-32(%edx), %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	cmp	-28(%edx), %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	cmp	-24(%edx), %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	cmp	-20(%edx), %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	xor	%eax, %eax
+	cmp	-4(%edx), %ecx
+	jne	L(find_diff)
+	POP (%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# endif
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
 L(45bytes):
 	mov	-45(%eax), %ecx
 	mov	-45(%edx), %ebx
@@ -1814,11 +1975,11 @@ L(5bytes):
 	cmp	-1(%edx), %cl
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP (%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(46bytes):
 	mov	-46(%eax), %ecx
 	mov	-46(%edx), %ebx
@@ -1882,11 +2043,11 @@ L(2bytes):
 	cmp	%bh, %ch
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP (%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(47bytes):
 	movl	-47(%eax), %ecx
 	movl	-47(%edx), %ebx
@@ -1953,11 +2114,11 @@ L(3bytes):
 	cmpb	-1(%edx), %al
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP (%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(find_diff):
 	cmpb	%bl, %cl
 	jne	L(end)
@@ -1968,14 +2129,30 @@ L(find_diff):
 	cmp	%bl, %cl
 	jne	L(end)
 	cmp	%bx, %cx
+
+	.p2align 4
 L(end):
-	POP (%ebx)
+	POP (%ebx)
 	mov	$1, %eax
 	ja	L(bigger)
 	neg	%eax
 L(bigger):
 	ret
+# else
 
-END (MEMCMP)
+/* for wmemcmp */
+	.p2align 4
+L(find_diff):
+	POP (%ebx)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+# endif
+END (MEMCMP)
 #endif
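The warning comment added near the top of the file is the behavioral crux of sharing this code: memcmp must compare elements as unsigned bytes, while wmemcmp compares wchar_t values, which on x86 GNU/Linux are signed 32-bit integers. A short illustrative C program (not part of the commit) shows how the two functions can order the same bit patterns differently:

```c
#include <stdio.h>
#include <string.h>
#include <wchar.h>

int
main (void)
{
  /* memcmp compares unsigned bytes: 0xff (255) is greater than 0x01.  */
  unsigned char a[1] = { 0xff };
  unsigned char b[1] = { 0x01 };
  printf ("memcmp > 0:  %d\n", memcmp (a, b, 1) > 0);    /* prints 1 */

  /* wmemcmp compares wchar_t values; with a signed 32-bit wchar_t,
     -1 (bit pattern 0xffffffff) is less than 1.  */
  wchar_t wa[1] = { -1 };
  wchar_t wb[1] = { 1 };
  printf ("wmemcmp < 0: %d\n", wmemcmp (wa, wb, 1) < 0); /* prints 1 */
  return 0;
}
```

This is why the wmemcmp paths in the patch branch with `jg` (signed) after comparing double words, as in `L(nequal)`, while the memcmp path at `L(end)` uses `ja` (unsigned) after comparing bytes.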