Diffstat (limited to 'sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S | 374
1 file changed, 0 insertions, 374 deletions
diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
deleted file mode 100644
index 138979d10a..0000000000
--- a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
+++ /dev/null
@@ -1,374 +0,0 @@
-/* strstr with unaligned loads
-   Copyright (C) 2009-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-ENTRY(__strstr_sse2_unaligned)
-	movzbl	(%rsi), %eax
-	testb	%al, %al
-	je	L(empty)
-	movzbl	1(%rsi), %edx
-	testb	%dl, %dl
-	je	L(strchr)
-	movd	%eax, %xmm1
-	movd	%edx, %xmm2
-	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4031, %rax
-	punpcklbw	%xmm2, %xmm2
-	punpcklwd	%xmm1, %xmm1
-	punpcklwd	%xmm2, %xmm2
-	pshufd	$0, %xmm1, %xmm1
-	pshufd	$0, %xmm2, %xmm2
-	ja	L(cross_page)
-	movdqu	(%rdi), %xmm3
-	pxor	%xmm5, %xmm5
-	movdqu	1(%rdi), %xmm4
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm4
-	movdqu	16(%rdi), %xmm0
-	pcmpeqb	%xmm5, %xmm6
-	pminub	%xmm4, %xmm3
-	movdqa	%xmm3, %xmm4
-	movdqu	17(%rdi), %xmm3
-	pcmpeqb	%xmm0, %xmm5
-	pcmpeqb	%xmm2, %xmm3
-	por	%xmm6, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	pminub	%xmm3, %xmm0
-	por	%xmm5, %xmm0
-	pmovmskb	%xmm4, %r8d
-	pmovmskb	%xmm0, %eax
-	salq	$16, %rax
-	orq	%rax, %r8
-	je	L(next_32_bytes)
-L(next_pair_index):
-	bsf	%r8, %rax
-	addq	%rdi, %rax
-	cmpb	$0, (%rax)
-	je	L(zero1)
-	movzbl	2(%rsi), %edx
-	testb	%dl, %dl
-	je	L(found1)
-	cmpb	2(%rax), %dl
-	jne	L(next_pair)
-	xorl	%edx, %edx
-	jmp	L(pair_loop_start)
-
-	.p2align 4
-L(strchr):
-	movzbl	%al, %esi
-	jmp	__strchr_sse2
-
-	.p2align 4
-L(pair_loop):
-	addq	$1, %rdx
-	cmpb	2(%rax,%rdx), %cl
-	jne	L(next_pair)
-L(pair_loop_start):
-	movzbl	3(%rsi,%rdx), %ecx
-	testb	%cl, %cl
-	jne	L(pair_loop)
-L(found1):
-	ret
-L(zero1):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(next_pair):
-	leaq	-1(%r8), %rax
-	andq	%rax, %r8
-	jne	L(next_pair_index)
-
-	.p2align 4
-L(next_32_bytes):
-	movdqu	32(%rdi), %xmm3
-	pxor	%xmm5, %xmm5
-	movdqu	33(%rdi), %xmm4
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm4
-	movdqu	48(%rdi), %xmm0
-	pcmpeqb	%xmm5, %xmm6
-	pminub	%xmm4, %xmm3
-	movdqa	%xmm3, %xmm4
-	movdqu	49(%rdi), %xmm3
-	pcmpeqb	%xmm0, %xmm5
-	pcmpeqb	%xmm2, %xmm3
-	por	%xmm6, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	pminub	%xmm3, %xmm0
-	por	%xmm5, %xmm0
-	pmovmskb	%xmm4, %eax
-	salq	$32, %rax
-	pmovmskb	%xmm0, %r8d
-	salq	$48, %r8
-	orq	%rax, %r8
-	je	L(loop_header)
-L(next_pair2_index):
-	bsfq	%r8, %rax
-	addq	%rdi, %rax
-	cmpb	$0, (%rax)
-	je	L(zero2)
-	movzbl	2(%rsi), %edx
-	testb	%dl, %dl
-	je	L(found2)
-	cmpb	2(%rax), %dl
-	jne	L(next_pair2)
-	xorl	%edx, %edx
-	jmp	L(pair_loop2_start)
-
-	.p2align 4
-L(pair_loop2):
-	addq	$1, %rdx
-	cmpb	2(%rax,%rdx), %cl
-	jne	L(next_pair2)
-L(pair_loop2_start):
-	movzbl	3(%rsi,%rdx), %ecx
-	testb	%cl, %cl
-	jne	L(pair_loop2)
-L(found2):
-	ret
-L(zero2):
-	xorl	%eax, %eax
-	ret
-L(empty):
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(next_pair2):
-	leaq	-1(%r8), %rax
-	andq	%rax, %r8
-	jne	L(next_pair2_index)
-L(loop_header):
-	movq	$-512, %r11
-	movq	%rdi, %r9
-
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-
-	.p2align 4
-L(loop):
-	movdqa	64(%rdi), %xmm3
-	movdqu	63(%rdi), %xmm6
-	movdqa	%xmm3, %xmm0
-	pxor	%xmm2, %xmm3
-	pxor	%xmm1, %xmm6
-	movdqa	80(%rdi), %xmm10
-	por	%xmm3, %xmm6
-	pminub	%xmm10, %xmm0
-	movdqu	79(%rdi), %xmm3
-	pxor	%xmm2, %xmm10
-	pxor	%xmm1, %xmm3
-	movdqa	96(%rdi), %xmm9
-	por	%xmm10, %xmm3
-	pminub	%xmm9, %xmm0
-	pxor	%xmm2, %xmm9
-	movdqa	112(%rdi), %xmm8
-	addq	$64, %rdi
-	pminub	%xmm6, %xmm3
-	movdqu	31(%rdi), %xmm4
-	pminub	%xmm8, %xmm0
-	pxor	%xmm2, %xmm8
-	pxor	%xmm1, %xmm4
-	por	%xmm9, %xmm4
-	pminub	%xmm4, %xmm3
-	movdqu	47(%rdi), %xmm5
-	pxor	%xmm1, %xmm5
-	por	%xmm8, %xmm5
-	pminub	%xmm5, %xmm3
-	pminub	%xmm3, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	testl	%eax, %eax
-	je	L(loop)
-	pminub	(%rdi), %xmm6
-	pminub	32(%rdi),%xmm4
-	pminub	48(%rdi),%xmm5
-	pcmpeqb	%xmm7, %xmm6
-	pcmpeqb	%xmm7, %xmm5
-	pmovmskb	%xmm6, %edx
-	movdqa	16(%rdi), %xmm8
-	pcmpeqb	%xmm7, %xmm4
-	movdqu	15(%rdi), %xmm0
-	pmovmskb	%xmm5, %r8d
-	movdqa	%xmm8, %xmm3
-	pmovmskb	%xmm4, %ecx
-	pcmpeqb	%xmm1,%xmm0
-	pcmpeqb	%xmm2,%xmm3
-	salq	$32, %rcx
-	pcmpeqb	%xmm7,%xmm8
-	salq	$48, %r8
-	pminub	%xmm0,%xmm3
-	orq	%rcx, %rdx
-	por	%xmm3,%xmm8
-	orq	%rdx, %r8
-	pmovmskb	%xmm8, %eax
-	salq	$16, %rax
-	orq	%rax, %r8
-	je	L(loop)
-L(next_pair_index3):
-	bsfq	%r8, %rcx
-	addq	%rdi, %rcx
-	cmpb	$0, (%rcx)
-	je	L(zero)
-	xorl	%eax, %eax
-	movzbl	2(%rsi), %edx
-	testb	%dl, %dl
-	je	L(success3)
-	cmpb	1(%rcx), %dl
-	jne	L(next_pair3)
-	jmp	L(pair_loop_start3)
-
-	.p2align 4
-L(pair_loop3):
-	addq	$1, %rax
-	cmpb	1(%rcx,%rax), %dl
-	jne	L(next_pair3)
-L(pair_loop_start3):
-	movzbl	3(%rsi,%rax), %edx
-	testb	%dl, %dl
-	jne	L(pair_loop3)
-L(success3):
-	lea	-1(%rcx), %rax
-	ret
-
-	.p2align 4
-L(next_pair3):
-	addq	%rax, %r11
-	movq	%rdi, %rax
-	subq	%r9, %rax
-	cmpq	%r11, %rax
-	jl	L(switch_strstr)
-	leaq	-1(%r8), %rax
-	andq	%rax, %r8
-	jne	L(next_pair_index3)
-	jmp	L(loop)
-
-	.p2align 4
-L(switch_strstr):
-	movq	%rdi, %rdi
-	jmp	__strstr_sse2
-
-	.p2align 4
-L(cross_page):
-
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqa	(%rax), %xmm3
-	movdqu	-1(%rax), %xmm4
-	movdqa	%xmm3, %xmm8
-	movdqa	16(%rax), %xmm5
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm8
-	pcmpeqb	%xmm2, %xmm3
-	movdqa	%xmm5, %xmm7
-	pminub	%xmm4, %xmm3
-	movdqu	15(%rax), %xmm4
-	pcmpeqb	%xmm0, %xmm7
-	por	%xmm3, %xmm8
-	movdqa	%xmm5, %xmm3
-	movdqa	32(%rax), %xmm5
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm3
-	movdqa	%xmm5, %xmm6
-	pmovmskb	%xmm8, %ecx
-	pminub	%xmm4, %xmm3
-	movdqu	31(%rax), %xmm4
-	por	%xmm3, %xmm7
-	movdqa	%xmm5, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	movdqa	48(%rax), %xmm5
-	pcmpeqb	%xmm1, %xmm4
-	pmovmskb	%xmm7, %r8d
-	pcmpeqb	%xmm2, %xmm3
-	pcmpeqb	%xmm5, %xmm0
-	pminub	%xmm4, %xmm3
-	movdqu	47(%rax), %xmm4
-	por	%xmm3, %xmm6
-	movdqa	%xmm5, %xmm3
-	salq	$16, %r8
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm6, %r10d
-	pminub	%xmm4, %xmm3
-	por	%xmm3, %xmm0
-	salq	$32, %r10
-	orq	%r10, %r8
-	orq	%rcx, %r8
-	movl	%edi, %ecx
-	pmovmskb	%xmm0, %edx
-	subl	%eax, %ecx
-	salq	$48, %rdx
-	orq	%rdx, %r8
-	shrq	%cl, %r8
-	je	L(loop_header)
-L(next_pair_index4):
-	bsfq	%r8, %rax
-	addq	%rdi, %rax
-	cmpb	$0, (%rax)
-	je	L(zero)
-
-	cmpq	%rax,%rdi
-	je	L(next_pair4)
-
-	movzbl	2(%rsi), %edx
-	testb	%dl, %dl
-	je	L(found3)
-	cmpb	1(%rax), %dl
-	jne	L(next_pair4)
-	xorl	%edx, %edx
-	jmp	L(pair_loop_start4)
-
-	.p2align 4
-L(pair_loop4):
-	addq	$1, %rdx
-	cmpb	1(%rax,%rdx), %cl
-	jne	L(next_pair4)
-L(pair_loop_start4):
-	movzbl	3(%rsi,%rdx), %ecx
-	testb	%cl, %cl
-	jne	L(pair_loop4)
-L(found3):
-	subq	$1, %rax
-	ret
-
-	.p2align 4
-L(next_pair4):
-	leaq	-1(%r8), %rax
-	andq	%rax, %r8
-	jne	L(next_pair_index4)
-	jmp	L(loop_header)
-
-	.p2align 4
-L(found):
-	rep
-	ret
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-
-END(__strstr_sse2_unaligned)
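Note on the removed algorithm: the file implemented strstr with a SIMD two-byte
filter. The first two needle bytes are broadcast into %xmm1 and %xmm2
(punpcklbw/punpcklwd/pshufd), each 16-byte haystack block is compared against
needle[0] with pcmpeqb while the block shifted by one byte is compared against
needle[1], the two results are merged with pminub and turned into a bitmask by
pmovmskb, and full verification (the L(pair_loop*) labels) runs only at the
surviving bit positions. If verification keeps failing past a fixed budget (the
$-512 counter in %r11 checked at L(next_pair3)), the code tail-calls
__strstr_sse2 instead of degrading quadratically. The C below is a minimal
scalar sketch of that filter for readers who don't speak SSE2; it is not code
from glibc, and two_byte_filter_strstr is a hypothetical name.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

/* Scalar sketch of the pair filter in __strstr_sse2_unaligned.  */
static const char *
two_byte_filter_strstr (const char *hay, const char *ned)
{
  if (ned[0] == '\0')
    return hay;			/* L(empty): empty needle matches at hay.  */
  if (ned[1] == '\0')
    return strchr (hay, ned[0]);	/* L(strchr): one-byte needle.  */

  for (const char *p = hay; p[0] != '\0'; p++)
    {
      /* Vectorized in the .S file: pcmpeqb of 16 bytes at p against
	 ned[0] and of 16 bytes at p + 1 against ned[1], merged with
	 pminub into a pmovmskb bitmask of candidate offsets.  */
      if (p[0] == ned[0] && p[1] == ned[1])
	{
	  /* Candidate survived the pair filter: verify the remaining
	     needle bytes, as L(pair_loop)/L(pair_loop_start) do.  */
	  size_t i = 2;
	  while (ned[i] != '\0' && p[i] == ned[i])
	    i++;
	  if (ned[i] == '\0')
	    return p;
	}
    }
  return NULL;
}

int
main (void)
{
  const char *r = two_byte_filter_strstr ("find a needle in a haystack",
					  "needle");
  printf ("%s\n", r ? r : "(not found)");
  return 0;
}

The sketch deliberately omits the fallback: in the assembly the filter's cost
is bounded, and pathological inputs are handed to __strstr_sse2, so a false
positive at every position cannot dominate the runtime.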