diff options
field | value | date |
---|---|---|
author | Ondrej Bilka <neleai@seznam.cz> | 2013-03-06 22:27:18 +0100 |
committer | Ondrej Bilka <neleai@seznam.cz> | 2013-03-06 22:27:18 +0100 |
commit | 87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823 (patch) | |
tree | ddef45a58945fed230d26a155bbc10739b3fa864 | |
parent | b79188d71716b6286866e06add976fe84100595e (diff) | |
download | glibc-87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823.tar glibc-87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823.tar.gz glibc-87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823.tar.bz2 glibc-87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823.zip | |
Revert " * sysdeps/x86_64/strlen.S: Replace with new SSE2 based implementation"
This reverts commit b79188d71716b6286866e06add976fe84100595e.
mode | file | lines changed |
---|---|---|
-rw-r--r-- | ChangeLog | 23 |
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 6 |
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 13 |
-rw-r--r-- | sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 229 |
-rw-r--r-- | sysdeps/x86_64/multiarch/strcat-ssse3.S | 312 |
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S | 685 |
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-sse2-pminub.S | 259 |
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-sse4.S | 84 |
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen.S | 68 |
-rw-r--r-- | sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S | 3 |
-rw-r--r-- | sysdeps/x86_64/multiarch/strnlen.S | 57 |
-rw-r--r-- | sysdeps/x86_64/strlen.S | 263 |
-rw-r--r-- | sysdeps/x86_64/strnlen.S | 67 |
13 files changed, 1306 insertions, 763 deletions
@@ -1,26 +1,3 @@ -2013-03-06 Ondrej Bilka <neleai@seznam.cz> - - * sysdeps/x86_64/strlen.S: Replace with new SSE2 based - implementation which is faster on all x86_64 architectures. - Tested on AMD, Intel Nehalem, SNB, IVB. - * sysdeps/x86_64/strnlen.S: Likewise. - - * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): - Remove all multiarch strlen and strnlen versions. - * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update. - Remove strlen and strnlen related parts. - - * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Update. - Inline strlen part. - * sysdeps/x86_64/multiarch/strcat-ssse3.S: Likewise. - - * sysdeps/x86_64/multiarch/strlen.S: Remove. - * sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S: Likewise. - * sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: Likewise. - * sysdeps/x86_64/multiarch/strlen-sse4.S: Likewise. - * sysdeps/x86_64/multiarch/strnlen.S: Likewise. - * sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S: Likewise. - 2013-03-06 Patsy Franklin <pfrankli@redhat.com> * io/fcntl.h: Added a comment about AT_EACCESS and AT_REMOVEDIR. 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 67686add61..dd6c27d0b4 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -10,12 +10,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ - strncase_l-ssse3 memset-x86-64 strcat-ssse3 strncat-ssse3\ + strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3 + strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ + strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \ + memcmp-ssse3 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 848991eac2..643cb2dd0a 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -187,6 +187,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) + /* Support sysdeps/x86_64/multiarch/strnlen.S. */ + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2_no_bsf) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + /* Support sysdeps/x86_64/multiarch/strpbrk.S. 
*/ IFUNC_IMPL (i, name, strpbrk, IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2, @@ -257,6 +262,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2)) + /* Support sysdeps/x86_64/multiarch/strlen.S. */ + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE4_2, __strlen_sse42) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_pminub) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_no_bsf) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + /* Support sysdeps/x86_64/multiarch/strncmp.S. */ IFUNC_IMPL (i, name, strncmp, IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2, diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S index 6d9951e89f..72bb609949 100644 --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -34,233 +34,10 @@ ENTRY (STRCAT) mov %rdx, %r8 # endif - xor %rax, %rax - mov %edi, %ecx - and $0x3f, %ecx - pxor %xmm0, %xmm0 - cmp $0x30, %ecx - ja L(next) - movdqu (%rdi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit_less16) - mov %rdi, %rax - and $-16, %rax - jmp L(align16_start) -L(next): - mov %rdi, %rax - and $-16, %rax - pcmpeqb (%rax), %xmm0 - mov $-1, %r10d - sub %rax, %rcx - shl %cl, %r10d - pmovmskb %xmm0, %edx - and %r10d, %edx - jnz L(exit) +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-sse2-pminub.S" +# undef RETURN -L(align16_start): - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - pcmpeqb 16(%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, 
%edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 80(%rax), %xmm0 - add $80, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm1 - add $16, %rax - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm2 - add $16, %rax - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm3 - add $16, %rax - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit) - - add $16, %rax - .p2align 4 - L(align64_loop): - movaps (%rax), %xmm4 - pminub 16(%rax), %xmm4 - movaps 32(%rax), %xmm5 - pminub 48(%rax), %xmm5 - add $64, %rax - pminub %xmm4, %xmm5 - pcmpeqb %xmm0, %xmm5 - pmovmskb %xmm5, %edx - test %edx, %edx - jz L(align64_loop) - - pcmpeqb -64(%rax), %xmm0 - sub $80, %rax - pmovmskb 
%xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $64, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit): - sub %rdi, %rax -L(exit_less16): - bsf %rdx, %rdx - add %rdx, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit16): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $16, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit32): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $32, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit48): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $48, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit64): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $64, %rax - - .p2align 4 L(StartStrcpyPart): lea (%r9, %rax), %rdi mov %rsi, %rcx diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S index 901e66f2c8..fea9d11b40 100644 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -33,317 +33,11 @@ ENTRY (STRCAT) mov %rdx, %r8 # endif - xor %eax, %eax - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - lea 16(%rdi), %rcx - lea 16(%rdi), %rax - and $-16, 
%rax - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-sse2-no-bsf.S" - .p2align 4 -L(aligned_64): - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - 
pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) - -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(exit) - -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(exit) - -L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(exit) - -L(aligned_64_exit_16): - lea -48(%rax), %rax - -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail1): - add $1, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail2): - add $2, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail3): - add $3, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail4): - add $4, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail5): - add $5, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail6): - add $6, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail7): - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail8): - add $8, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail9): - add $9, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail10): - add $10, %eax - jmp L(StartStrcpyPart) - 
- .p2align 4 -L(exit_tail11): - add $11, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail12): - add $12, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail13): - add $13, %eax - jmp L(StartStrcpyPart) +# undef RETURN - .p2align 4 -L(exit_tail14): - add $14, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail15): - add $15, %eax - - .p2align 4 L(StartStrcpyPart): mov %rsi, %rcx lea (%rdi, %rax), %rdx diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S new file mode 100644 index 0000000000..ff2ab70044 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S @@ -0,0 +1,685 @@ +/* strlen SSE2 without bsf + Copyright (C) 2010-2013 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +/* only for strlen case we don't use optimized version for STATIC build just for SHARED */ + +#if (defined SHARED || defined USE_AS_STRCAT || defined USE_AS_STRNLEN) && !defined NOT_IN_libc + +# ifndef USE_AS_STRCAT + +# include <sysdep.h> + +# define RETURN ret + +# ifndef STRLEN +# define STRLEN __strlen_sse2_no_bsf +# endif + + atom_text_section +ENTRY (STRLEN) +# endif + xor %eax, %eax +# ifdef USE_AS_STRNLEN + mov %rsi, %r8 + sub $4, %rsi + jbe L(len_less4_prolog) +# endif + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less8_prolog) +# endif + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less12_prolog) +# endif + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less16_prolog) +# endif + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + +# ifdef USE_AS_STRNLEN + and $15, %rdi + add %rdi, %rsi + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# 
ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + mov %rax, %rdx + and $63, %rdx + add %rdx, %rsi +# endif + + and $-0x40, %rax + + .p2align 4 +L(aligned_64): +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz 
L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_16): + lea -48(%rax), %rax +L(aligned_64_exit): +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + RETURN + +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + RETURN + +# ifdef USE_AS_STRNLEN + + .p2align 4 +L(len_less64): + pxor %xmm0, %xmm0 + add $64, %rsi + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + mov %r8, %rax + ret + + .p2align 4 +L(strnlen_exit): + sub %rcx, %rax + + test %dl, %dl + jz L(strnlen_exit_high) + mov %dl, %cl + and $15, %cl + jz L(strnlen_exit_8) + test $0x01, %dl + jnz L(exit_tail0) + 
test $0x02, %dl + jnz L(strnlen_exit_tail1) + test $0x04, %dl + jnz L(strnlen_exit_tail2) + sub $4, %rsi + jb L(return_start_len) + lea 3(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_8): + test $0x10, %dl + jnz L(strnlen_exit_tail4) + test $0x20, %dl + jnz L(strnlen_exit_tail5) + test $0x40, %dl + jnz L(strnlen_exit_tail6) + sub $8, %rsi + jb L(return_start_len) + lea 7(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_high): + mov %dh, %ch + and $15, %ch + jz L(strnlen_exit_high_8) + test $0x01, %dh + jnz L(strnlen_exit_tail8) + test $0x02, %dh + jnz L(strnlen_exit_tail9) + test $0x04, %dh + jnz L(strnlen_exit_tail10) + sub $12, %rsi + jb L(return_start_len) + lea 11(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_high_8): + test $0x10, %dh + jnz L(strnlen_exit_tail12) + test $0x20, %dh + jnz L(strnlen_exit_tail13) + test $0x40, %dh + jnz L(strnlen_exit_tail14) + sub $16, %rsi + jb L(return_start_len) + lea 15(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail1): + sub $2, %rsi + jb L(return_start_len) + lea 1(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail2): + sub $3, %rsi + jb L(return_start_len) + lea 2(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail4): + sub $5, %rsi + jb L(return_start_len) + lea 4(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail5): + sub $6, %rsi + jb L(return_start_len) + lea 5(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail6): + sub $7, %rsi + jb L(return_start_len) + lea 6(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail8): + sub $9, %rsi + jb L(return_start_len) + lea 8(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail9): + sub $10, %rsi + jb L(return_start_len) + lea 9(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail10): + sub $11, %rsi + jb L(return_start_len) + lea 10(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail12): + sub $13, %rsi + jb L(return_start_len) + lea 12(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail13): + sub $14, %rsi + jb L(return_start_len) + lea 13(%eax), 
%eax + ret + + .p2align 4 +L(strnlen_exit_tail14): + sub $15, %rsi + jb L(return_start_len) + lea 14(%eax), %eax + ret + + .p2align 4 +L(return_start_len): + mov %r8, %rax + ret + +/* for prolog only */ + + .p2align 4 +L(len_less4_prolog): + add $4, %rsi + jz L(exit_tail0) + + cmpb $0, (%rdi) + jz L(exit_tail0) + cmp $1, %esi + je L(exit_tail1) + + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmp $2, %esi + je L(exit_tail2) + + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmp $3, %esi + je L(exit_tail3) + + cmpb $0, 3(%rdi) + jz L(exit_tail3) + mov $4, %eax + ret + + .p2align 4 +L(len_less8_prolog): + add $4, %rsi + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmp $1, %esi + je L(exit_tail5) + + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmp $2, %esi + je L(exit_tail6) + + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmp $3, %esi + je L(exit_tail7) + + cmpb $0, 7(%rdi) + jz L(exit_tail7) + mov $8, %eax + ret + + .p2align 4 +L(len_less12_prolog): + add $4, %rsi + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmp $1, %esi + je L(exit_tail9) + + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmp $2, %esi + je L(exit_tail10) + + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmp $3, %esi + je L(exit_tail11) + + cmpb $0, 11(%rdi) + jz L(exit_tail11) + mov $12, %eax + ret + + .p2align 4 +L(len_less16_prolog): + add $4, %rsi + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmp $1, %esi + je L(exit_tail13) + + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmp $2, %esi + je L(exit_tail14) + + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmp $3, %esi + je L(exit_tail15) + + cmpb $0, 15(%rdi) + jz L(exit_tail15) + mov $16, %eax + ret +# endif + + .p2align 4 +L(exit_tail1): + add $1, %eax + RETURN + + .p2align 4 +L(exit_tail2): + add $2, %eax + RETURN + + .p2align 4 +L(exit_tail3): + add $3, %eax + RETURN + + .p2align 4 +L(exit_tail4): + add $4, %eax + RETURN + + .p2align 4 +L(exit_tail5): + add $5, %eax + RETURN + + .p2align 4 +L(exit_tail6): + add $6, %eax + RETURN + + .p2align 4 +L(exit_tail7): + add $7, %eax + RETURN + + .p2align 4 
+L(exit_tail8): + add $8, %eax + RETURN + + .p2align 4 +L(exit_tail9): + add $9, %eax + RETURN + + .p2align 4 +L(exit_tail10): + add $10, %eax + RETURN + + .p2align 4 +L(exit_tail11): + add $11, %eax + RETURN + + .p2align 4 +L(exit_tail12): + add $12, %eax + RETURN + + .p2align 4 +L(exit_tail13): + add $13, %eax + RETURN + + .p2align 4 +L(exit_tail14): + add $14, %eax + RETURN + + .p2align 4 +L(exit_tail15): + add $15, %eax +# ifndef USE_AS_STRCAT + RETURN +END (STRLEN) +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S new file mode 100644 index 0000000000..cc4bb57e97 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S @@ -0,0 +1,259 @@ +/* strlen SSE2 + Copyright (C) 2011-2013 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT) + +# ifndef USE_AS_STRCAT + +# include <sysdep.h> + +# define RETURN ret + + .section .text.sse2,"ax",@progbits +ENTRY (__strlen_sse2_pminub) + +# endif + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + pmovmskb %xmm0, %edx + and %r10d, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz 
L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 80(%rax), %xmm0 + add $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm1 + add $16, %rax + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm2 + add $16, %rax + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm3 + add $16, %rax + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit) + + add $16, %rax + .p2align 4 + L(align64_loop): + movaps (%rax), %xmm4 + pminub 16(%rax), %xmm4 + movaps 32(%rax), %xmm5 + pminub 48(%rax), %xmm5 + add $64, %rax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %edx + test %edx, %edx + jz L(align64_loop) + + + pcmpeqb -64(%rax), %xmm0 + sub $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + RETURN + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + RETURN + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $16, %rax + RETURN + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $32, %rax + RETURN + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $48, %rax + RETURN + .p2align 4 +L(exit64): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax +# ifndef USE_AS_STRCAT + RETURN + +END (__strlen_sse2_pminub) +# endif +#endif diff 
--git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S new file mode 100644 index 0000000000..8d685df0cf --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse4.S @@ -0,0 +1,84 @@ +/* strlen with SSE4 + Copyright (C) 2009-2013 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if defined SHARED && !defined NOT_IN_libc + +#include <sysdep.h> + + .section .text.sse4.2,"ax",@progbits +ENTRY (__strlen_sse42) + pxor %xmm1, %xmm1 + movl %edi, %ecx + movq %rdi, %r8 + andq $~15, %rdi + xor %edi, %ecx + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %edx + shrl %cl, %edx + shll %cl, %edx + andl %edx, %edx + jnz L(less16bytes) + pxor %xmm1, %xmm1 + + .p2align 4 +L(more64bytes_loop): + pcmpistri $0x08, 16(%rdi), %xmm1 + jz L(more32bytes) + + pcmpistri $0x08, 32(%rdi), %xmm1 + jz L(more48bytes) + + pcmpistri $0x08, 48(%rdi), %xmm1 + jz L(more64bytes) + + add $64, %rdi + pcmpistri $0x08, (%rdi), %xmm1 + jnz L(more64bytes_loop) + leaq (%rdi,%rcx), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more32bytes): + leaq 16(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more48bytes): + leaq 32(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more64bytes): + leaq 48(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(less16bytes): + subq %r8, %rdi + bsfl %edx, %eax + addq %rdi, %rax + ret + +END (__strlen_sse42) + +#endif diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S new file mode 100644 index 0000000000..ab29ceff21 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -0,0 +1,68 @@ +/* Multiple versions of strlen(str) -- determine the length of the string STR. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2013 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + +/* Define multiple versions only for the definition in libc and for + the DSO. In static binaries we need strlen before the initialization + happened. */ +#if defined SHARED && !defined NOT_IN_libc + .text +ENTRY(strlen) + .type strlen, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __strlen_sse2_pminub(%rip), %rax + testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip) + jnz 2f + leaq __strlen_sse2(%rip), %rax + testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) + jz 2f + leaq __strlen_sse42(%rip), %rax + ret +2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) + jz 3f + leaq __strlen_sse2_no_bsf(%rip), %rax +3: ret +END(strlen) + +# undef ENTRY +# define ENTRY(name) \ + .type __strlen_sse2, @function; \ + .align 16; \ + .globl __strlen_sse2; \ + .hidden __strlen_sse2; \ + __strlen_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strlen_sse2, .-__strlen_sse2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strlen calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strlen; __GI_strlen = __strlen_sse2 +#endif + +#include "../strlen.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S new file mode 100644 index 0000000000..248328d999 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNLEN +#define STRLEN __strnlen_sse2_no_bsf +#include "strlen-sse2-no-bsf.S" diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S new file mode 100644 index 0000000000..124f8458a3 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen.S @@ -0,0 +1,57 @@ +/* multiple version of strnlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + +/* Define multiple versions only for the definition in libc. 
*/ +#ifndef NOT_IN_libc + + .text +ENTRY(__strnlen) + .type __strnlen, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __strnlen_sse2(%rip), %rax + testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) + jz 2f + leaq __strnlen_sse2_no_bsf(%rip), %rax +2: ret +END(__strnlen) + +# undef ENTRY +# define ENTRY(name) \ + .type __strnlen_sse2, @function; \ + .align 16; \ + .globl __strnlen_sse2; \ + .hidden __strnlen_sse2; \ + __strnlen_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2 + +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2 +#endif + +#include "../strnlen.S" diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index e82fe8d039..4bdca0a452 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,5 +1,6 @@ -/* SSE2 version of strlen. - Copyright (C) 2012, 2013 Free Software Foundation, Inc. +/* strlen(str) -- determine the length of the string STR. + Copyright (C) 2009-2013 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,217 +19,83 @@ #include <sysdep.h> -/* Long lived register are - strlen(s), strnlen(s, n): - %xmm11 - zero - %rdi - s - %r10 (s+n) & (~(64-1)) - %r11 s+n -*/ - - -.text + .text ENTRY(strlen) - -#define FIND_ZERO \ - pcmpeqb (%rax), %xmm8; \ - pcmpeqb 16(%rax), %xmm9; \ - pcmpeqb 32(%rax), %xmm10; \ - pcmpeqb 48(%rax), %xmm11; \ - pmovmskb %xmm8, %esi; \ - pmovmskb %xmm9, %edx; \ - pmovmskb %xmm10, %r8d; \ - pmovmskb %xmm11, %ecx; \ - salq $16, %rdx; \ - salq $16, %rcx; \ - orq %rsi, %rdx; \ - orq %r8, %rcx; \ - salq $32, %rcx; \ - orq %rcx, %rdx; - -#ifdef AS_STRNLEN -/* Do not read anything when n==0. 
*/ - test %rsi, %rsi - jne L(n_nonzero) xor %rax, %rax - ret -L(n_nonzero): - -/* Initialize long lived registers. */ - - add %rdi, %rsi - mov %rsi, %r10 - and $-64, %r10 - mov %rsi, %r11 -#endif - - pxor %xmm8, %xmm8 - pxor %xmm9, %xmm9 - pxor %xmm10, %xmm10 - pxor %xmm11, %xmm11 - movq %rdi, %rax - movq %rdi, %rcx - andq $4095, %rcx -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ - cmpq $4047, %rcx -/* We cannot unify this branching as it would be ~6 cycles slower. */ + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx ja L(next) - -#ifdef AS_STRNLEN -# define STRNLEN_PROLOG \ - mov %r11, %rsi; \ - subq %rax, %rsi; \ - andq $-64, %rax; \ - testq $-64, %rsi; \ - je L(strnlen_ret) -#else -# define STRNLEN_PROLOG andq $-64, %rax; -#endif - -#define PROLOG(lab) \ - movq %rdi, %rcx; \ - xorq %rax, %rcx; \ - STRNLEN_PROLOG; \ - sarq %cl, %rdx; \ - test %rdx, %rdx; \ - je L(lab); \ - bsfq %rdx, %rax; \ - ret - -#ifdef AS_STRNLEN - andq $-16, %rax - FIND_ZERO -#else - movdqu (%rax), %xmm12 - pcmpeqb %xmm8, %xmm12 - pmovmskb %xmm12, %edx + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx test %edx, %edx - je L(next48_bytes) - bsfq %rdx, %rax - ret - -L(next48_bytes): -/* Same as FIND_ZERO except we do not check first 16 bytes. */ - andq $-16, %rax - pcmpeqb 16(%rax), %xmm9; - pcmpeqb 32(%rax), %xmm10; - pcmpeqb 48(%rax), %xmm11; - pmovmskb %xmm9, %edx; - pmovmskb %xmm10, %r8d; - pmovmskb %xmm11, %ecx; - salq $16, %rdx; - salq $16, %rcx; - orq %r8, %rcx; - salq $32, %rcx; - orq %rcx, %rdx; -#endif - - PROLOG(loop) - - .p2align 4 + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) L(next): - andq $-64, %rax - FIND_ZERO - PROLOG(loop_init) - -#ifdef AS_STRNLEN -/* We must do this check to correctly handle strnlen (s, -1). 
*/ -L(strnlen_ret): - bts %rsi, %rdx - sarq %cl, %rdx - test %rdx, %rdx - je L(loop_init) - bsfq %rdx, %rax - ret -#endif - .p2align 4 -L(loop_init): - pxor %xmm9, %xmm9 - pxor %xmm10, %xmm10 - pxor %xmm11, %xmm11 -#ifdef AS_STRNLEN + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %esi + sub %rax, %rcx + shl %cl, %esi + pmovmskb %xmm0, %edx + and %esi, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 .p2align 4 -L(loop): - - addq $64, %rax - cmpq %rax, %r10 - je L(exit_end) +L(align16_loop): + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) - movdqa (%rax), %xmm8 - pminub 16(%rax), %xmm8 - pminub 32(%rax), %xmm8 - pminub 48(%rax), %xmm8 - pcmpeqb %xmm11, %xmm8 - pmovmskb %xmm8, %edx - testl %edx, %edx - jne L(exit) - jmp L(loop) + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) - .p2align 4 -L(exit_end): - cmp %rax, %r11 - je L(first) - pxor %xmm8, %xmm8 - FIND_ZERO + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) -L(first): - bts %r11, %rdx - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%rax), %rax + test %edx, %edx + jz L(align16_loop) +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax ret - .p2align 4 -L(exit): - pxor %xmm8, %xmm8 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + lea 16(%rdx,%rax), %rax ret - -#else .p2align 4 -L(loop): - - movdqa 64(%rax), %xmm8 - pminub 80(%rax), %xmm8 - pminub 96(%rax), %xmm8 - pminub 112(%rax), %xmm8 - pcmpeqb %xmm11, %xmm8 - pmovmskb %xmm8, %edx - testl %edx, %edx - jne L(exit64) - - subq $-128, %rax - - movdqa (%rax), %xmm8 - pminub 16(%rax), %xmm8 - pminub 32(%rax), %xmm8 - pminub 48(%rax), %xmm8 - pcmpeqb %xmm11, %xmm8 - pmovmskb %xmm8, %edx - testl %edx, %edx - jne L(exit0) - jmp 
L(loop) - +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + lea 32(%rdx,%rax), %rax + ret .p2align 4 -L(exit64): - addq $64, %rax -L(exit0): - pxor %xmm8, %xmm8 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + lea 48(%rdx,%rax), %rax ret - -#endif - END(strlen) -#ifndef AS_STRLEN libc_hidden_builtin_def (strlen) -#endif diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S index d3c43ac482..6e53503060 100644 --- a/sysdeps/x86_64/strnlen.S +++ b/sysdeps/x86_64/strnlen.S @@ -1,6 +1,63 @@ -#define AS_STRNLEN -#define strlen __strnlen -#include "strlen.S" +/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN. + Copyright (C) 2010-2013 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. + This file is part of the GNU C Library. -weak_alias (__strnlen, strnlen); -libc_hidden_builtin_def (strnlen) + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + + + .text +ENTRY(__strnlen) + movq %rsi, %rax + testq %rsi, %rsi + jz 3f + pxor %xmm2, %xmm2 + movq %rdi, %rcx + movq %rdi, %r8 + movq $16, %r9 + andq $~15, %rdi + movdqa %xmm2, %xmm1 + pcmpeqb (%rdi), %xmm2 + orl $0xffffffff, %r10d + subq %rdi, %rcx + shll %cl, %r10d + subq %rcx, %r9 + pmovmskb %xmm2, %edx + andl %r10d, %edx + jnz 1f + subq %r9, %rsi + jbe 3f + +2: movdqa 16(%rdi), %xmm0 + leaq 16(%rdi), %rdi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jnz 1f + subq $16, %rsi + jnbe 2b +3: ret + +1: subq %r8, %rdi + bsfl %edx, %edx + addq %rdi, %rdx + cmpq %rdx, %rax + cmovnbq %rdx, %rax + ret +END(__strnlen) +weak_alias (__strnlen, strnlen) +libc_hidden_def (strnlen) |