Diffstat (limited to 'sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 210 |
1 files changed, 210 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
new file mode 100644
index 0000000000..eed8432973
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -0,0 +1,210 @@
+/* strcmp with unaligned loads
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "sysdep.h"
+#define ALIGN(x) .p2align x
+
+ENTRY ( __strcmp_sse2_unaligned)
+	movl	%edi, %eax
+	xorl	%edx, %edx
+	pxor	%xmm7, %xmm7
+	orl	%esi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	jg	L(cross_page)
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rsi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pminub	%xmm1, %xmm0
+	pxor	%xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb	%xmm0, %eax
+	testq	%rax, %rax
+	je	L(next_48_bytes)
+L(return):
+	bsfq	%rax, %rdx
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	subl	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(next_48_bytes):
+	movdqu	16(%rdi), %xmm6
+	movdqu	16(%rsi), %xmm3
+	movdqu	32(%rdi), %xmm5
+	pcmpeqb	%xmm6, %xmm3
+	movdqu	32(%rsi), %xmm2
+	pminub	%xmm6, %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	movdqu	48(%rdi), %xmm4
+	pcmpeqb	%xmm5, %xmm2
+	pmovmskb	%xmm3, %edx
+	movdqu	48(%rsi), %xmm0
+	pminub	%xmm5, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb	%xmm2, %eax
+	salq	$16, %rdx
+	pminub	%xmm4, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	salq	$32, %rax
+	orq	%rdx, %rax
+	pmovmskb	%xmm0, %ecx
+	movq	%rcx, %rdx
+	salq	$48, %rdx
+	orq	%rdx, %rax
+	jne	L(return)
+L(main_loop_header):
+	leaq	64(%rdi), %rdx
+	movl	$4096, %ecx
+	pxor	%xmm9, %xmm9
+	andq	$-64, %rdx
+	subq	%rdi, %rdx
+	leaq	(%rdi, %rdx), %rax
+	addq	%rsi, %rdx
+	movq	%rdx, %rsi
+	andl	$4095, %esi
+	subq	%rsi, %rcx
+	shrq	$6, %rcx
+	movq	%rcx, %rsi
+	jmp	L(loop_start)
+
+	ALIGN (4)
+L(loop):
+	addq	$64, %rax
+	addq	$64, %rdx
+L(loop_start):
+	testq	%rsi, %rsi
+	leaq	-1(%rsi), %rsi
+	je	L(loop_cross_page)
+L(back_to_loop):
+	movdqu	(%rdx), %xmm0
+	movdqu	16(%rdx), %xmm1
+	movdqa	(%rax), %xmm2
+	movdqa	16(%rax), %xmm3
+	pcmpeqb	%xmm2, %xmm0
+	movdqu	32(%rdx), %xmm5
+	pcmpeqb	%xmm3, %xmm1
+	pminub	%xmm2, %xmm0
+	movdqu	48(%rdx), %xmm6
+	pminub	%xmm3, %xmm1
+	movdqa	32(%rax), %xmm2
+	pminub	%xmm1, %xmm0
+	movdqa	48(%rax), %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm2, %xmm5
+	pminub	%xmm3, %xmm6
+	pminub	%xmm5, %xmm0
+	pminub	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm0, %ecx
+	testl	%ecx, %ecx
+	je	L(loop)
+	pcmpeqb	%xmm7, %xmm5
+	movdqu	(%rdx), %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	movdqa	(%rax), %xmm2
+	pcmpeqb	%xmm2, %xmm0
+	pminub	%xmm2, %xmm0
+	pcmpeqb	%xmm7, %xmm6
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm5, %r8d
+	pmovmskb	%xmm0, %edi
+	salq	$16, %rcx
+	salq	$32, %r8
+	pmovmskb	%xmm6, %esi
+	orq	%r8, %rcx
+	orq	%rdi, %rcx
+	salq	$48, %rsi
+	orq	%rsi, %rcx
+	bsfq	%rcx, %rcx
+	movzbl	(%rax, %rcx), %eax
+	movzbl	(%rdx, %rcx), %edx
+	subl	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(loop_cross_page):
+	xor	%r10, %r10
+	movq	%rdx, %r9
+	and	$63, %r9
+	subq	%r9, %r10
+
+	movdqa	(%rdx, %r10), %xmm0
+	movdqa	16(%rdx, %r10), %xmm1
+	movdqu	(%rax, %r10), %xmm2
+	movdqu	16(%rax, %r10), %xmm3
+	pcmpeqb	%xmm2, %xmm0
+	movdqa	32(%rdx, %r10), %xmm5
+	pcmpeqb	%xmm3, %xmm1
+	pminub	%xmm2, %xmm0
+	movdqa	48(%rdx, %r10), %xmm6
+	pminub	%xmm3, %xmm1
+	movdqu	32(%rax, %r10), %xmm2
+	movdqu	48(%rax, %r10), %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm2, %xmm5
+	pminub	%xmm3, %xmm6
+
+	pcmpeqb	%xmm7, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pcmpeqb	%xmm7, %xmm5
+	pcmpeqb	%xmm7, %xmm6
+
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm5, %r8d
+	pmovmskb	%xmm0, %edi
+	salq	$16, %rcx
+	salq	$32, %r8
+	pmovmskb	%xmm6, %esi
+	orq	%r8, %rdi
+	orq	%rcx, %rdi
+	salq	$48, %rsi
+	orq	%rsi, %rdi
+	movq	%r9, %rcx
+	movq	$63, %rsi
+	shrq	%cl, %rdi
+	test	%rdi, %rdi
+	je	L(back_to_loop)
+	bsfq	%rdi, %rcx
+	movzbl	(%rax, %rcx), %eax
+	movzbl	(%rdx, %rcx), %edx
+	subl	%edx, %eax
+	ret
+
+	ALIGN (4)
+L(cross_page_loop):
+	cmpb	%cl, %al
+	jne	L(different)
+	addq	$1, %rdx
+	cmpq	$64, %rdx
+	je	L(main_loop_header)
+L(cross_page):
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %ecx
+	testb	%al, %al
+	jne	L(cross_page_loop)
+	xorl	%eax, %eax
+L(different):
+	subl	%ecx, %eax
+	ret
+END (__strcmp_sse2_unaligned)
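For readers who want the shape of the algorithm without decoding the SSE2 sequences, here is a rough C model of the control flow. It is a sketch only: the names strcmp_sketch and block_mismatch are invented, the code is scalar where the assembly compares 16-byte vectors, and it does not reproduce the deliberate in-page over-reads or the 64-byte realignment of the source pointer.

/* Illustrative-only C model of the control flow in __strcmp_sse2_unaligned.
   Names are made up; the real routine works on SSE2 vectors.  */
#include <stddef.h>

/* Index of the first byte where A differs from B or A terminates, scanning
   at most N bytes; returns N when the whole block matches with no NUL.
   The assembly derives the same index from pcmpeqb/pminub/pmovmskb results
   combined into one 64-bit mask and scanned with bsf.  */
static size_t
block_mismatch (const unsigned char *a, const unsigned char *b, size_t n)
{
  size_t i = 0;
  while (i < n && a[i] == b[i] && a[i] != '\0')
    i++;
  return i;
}

int
strcmp_sketch (const char *s1, const char *s2)
{
  const unsigned char *a = (const unsigned char *) s1;
  const unsigned char *b = (const unsigned char *) s2;
  size_t i;

  /* andl $4095 / cmpl $4032 / jg L(cross_page): take the slow path when a
     64-byte window starting at either pointer could cross a page end.  */
  if ((((size_t) s1 | (size_t) s2) & 4095) > 4032)
    {
      /* L(cross_page): byte by byte until 64 bytes are known to be safe.  */
      for (i = 0; i < 64; i++)
	if (a[i] != b[i] || a[i] == '\0')
	  return a[i] - b[i];
    }
  else
    {
      /* First 64 bytes via unaligned loads (four 16-byte vectors).  */
      i = block_mismatch (a, b, 64);
      if (i < 64)
	return a[i] - b[i];
    }

  /* L(loop): 64-byte blocks; the assembly additionally aligns one pointer
     to 64 bytes and detours through L(loop_cross_page) whenever the other
     pointer's block would cross a page boundary.  */
  for (i = 64; ; i += 64)
    {
      size_t j = block_mismatch (a + i, b + i, 64);
      if (j < 64)
	return a[i + j] - b[i + j];
    }
}

The 4032 threshold works because 4096 - 64 = 4032: when (ptr & 4095) <= 4032 for both pointers, an unaligned 64-byte read starting at either one stays within its page, so it cannot fault even if the string terminates earlier.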