From 093ecf92998de275820296058ad5648e354b9e0d Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Fri, 7 Oct 2011 11:49:10 -0400 Subject: Improve 64 bit memchr, memrchr, rawmemchr with SSE2 --- sysdeps/x86_64/memchr.S | 316 +++++++++++++++++++++++++---- sysdeps/x86_64/memrchr.S | 380 +++++++++++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/rawmemchr.S | 10 +- sysdeps/x86_64/rawmemchr.S | 195 ++++++++++++++++-- 4 files changed, 842 insertions(+), 59 deletions(-) create mode 100644 sysdeps/x86_64/memrchr.S (limited to 'sysdeps') diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index 6082aa7f76..895a014f5e 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -1,5 +1,5 @@ -/* Copyright (C) 2009 Free Software Foundation, Inc. - Contributed by Ulrich Drepper . +/* Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,50 +19,294 @@ #include +/* fast SSE2 version with using pmaxub and 64 byte loop */ .text -ENTRY (memchr) - movd %esi, %xmm1 - movq %rdi, %rcx +ENTRY(memchr) + movd %rsi, %xmm1 + mov %rdi, %rcx + punpcklbw %xmm1, %xmm1 - andq $~15, %rdi - testq %rdx, %rdx + test %rdx, %rdx + jz L(return_null) punpcklbw %xmm1, %xmm1 - jz 3f - orl $0xffffffff, %esi - movdqa (%rdi), %xmm0 + + and $63, %rcx pshufd $0, %xmm1, %xmm1 - subq %rdi, %rcx + + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches_1) + sub $16, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %rcx + and $-16, %rdi + add %rcx, %rdx + sub $64, %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + add %rcx, %rdx + sub $16, %rdx + jbe L(return_null) + add $16, %rdi + sub $64, %rdx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + sub $64, %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $-64, %rdi + and $63, %rcx + add %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $64, %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(exit_loop): + add $32, %rdx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 - addq %rcx, %rdx - shl %cl, %esi - pmovmskb %xmm0, %ecx - andl %esi, %ecx - movl $16, %esi - jnz 1f - cmpq %rsi, %rdx - jbe 3f - -2: movdqa (%rdi,%rsi), %xmm0 - leaq 16(%rsi), %rsi + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + sub $16, %rdx + jle L(return_null) + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %rax, %rax + ret + + .p2align 4 +L(exit_loop_32): + add $32, %rdx + movdqa (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %ecx - testl %ecx, %ecx - jnz 1f - cmpq %rsi, %rdx - ja 2b + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + sub $16, %rdx + jbe L(return_null) -3: xorl %eax, %eax + pcmpeqb 16(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %rax, %rax ret -1: leaq -16(%rdi,%rsi), %rax - bsfl %ecx, %ecx - addq %rcx, %rax - leaq -16(%rsi,%rcx), %rsi - cmpq %rsi, %rdx - jbe 3b + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax ret -END (memchr) + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsf %eax, %eax + sub %rax, %rdx + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret +END(memchr) strong_alias (memchr, __memchr) -libc_hidden_builtin_def (memchr) + +libc_hidden_builtin_def(memchr) diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S new file mode 100644 index 0000000000..a85dc6b03b --- /dev/null +++ b/sysdeps/x86_64/memrchr.S @@ -0,0 +1,380 @@ +/* fast SSE2 memrchr with 64 byte loop and pmaxub instruction using + + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include + + .text +ENTRY (memrchr) + movd %rsi, %xmm1 + + sub $16, %rdx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + add %rdx, %rdi + pshufd $0, %xmm1, %xmm1 + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %rdi + mov %rdi, %rcx + and $15, %rcx + jz L(loop_prolog) + + add $16, %rdi + add $16, %rdx + and $-16, %rdi + sub %rcx, %rdx + + .p2align 4 +L(loop_prolog): + sub $64, %rdx + jbe L(exit_loop) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %rdi + sub $64, %rdx + jbe L(exit_loop) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $63, %rcx + jz L(align64_loop) + + add $64, %rdi + add $64, %rdx + and $-64, %rdi + sub %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $64, %rdi + sub $64, %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%rdi), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %rdi, %rax + ret + + .p2align 4 +L(exit_loop): + add $64, %rdx + cmp $32, %rdx + jbe L(exit_loop_32) + + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %rdx + jbe L(return_null) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %rdx + jbe L(return_null) + + pcmpeqb 32(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %rdx + add %rax, %rdx + jl L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %rdx + add %rax, %rdx + jl L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %rdx + add %rax, %rdx + jl L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %rdx + add %rax, %rdx + jl L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%rdi), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + pmovmskb %xmm1, %eax + + and %edx, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + add $16, %rdx + + pshufd $0, %xmm1, %xmm1 + + mov %rdi, %rcx + and $15, %rcx + jz L(length_less16_offset0) + + mov %rdi, %rcx + and $15, %rcx + mov %cl, %dh + mov %rcx, %r8 + add %dl, %dh + and $-16, %rdi + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + + sar %cl, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + add %r8, %rax + ret + + .p2align 4 +L(length_less16_part2): + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %eax + + test %eax, %eax + jnz L(length_less16_part2_return) + + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + + mov %r8, %rcx + sar %cl, %eax + test %eax, %eax + jz L(return_null) + + bsr %eax, %eax + add %rdi, %rax + add %r8, %rax + ret + + .p2align 4 +L(length_less16_part2_return): + bsr %eax, %eax + lea 16(%rax, %rdi), %rax + ret + +END (memrchr) +strong_alias (memrchr, __memrchr) diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S index 2a8a6909ef..a8933fb55a 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr.S +++ b/sysdeps/x86_64/multiarch/rawmemchr.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2009 Free Software Foundation, Inc. +/* Copyright (C) 2009, 2011 Free Software Foundation, Inc. Contributed by Ulrich Drepper . This file is part of the GNU C Library. @@ -29,11 +29,15 @@ ENTRY(rawmemchr) cmpl $0, __cpu_features+KIND_OFFSET(%rip) jne 1f call __init_cpu_features -1: leaq __rawmemchr_sse2(%rip), %rax +1: testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip) + jnz 2f testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) jz 2f leaq __rawmemchr_sse42(%rip), %rax -2: ret + ret +2: leaq __rawmemchr_sse2(%rip), %rax + ret + END(rawmemchr) strong_alias (rawmemchr, __rawmemchr) diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S index cfb4cebf68..a68b52c45e 100644 --- a/sysdeps/x86_64/rawmemchr.S +++ b/sysdeps/x86_64/rawmemchr.S @@ -1,5 +1,7 @@ -/* Copyright (C) 2009 Free Software Foundation, Inc. - Contributed by Ulrich Drepper . +/* fast SSE2 memchr with 64 byte loop and pmaxub instruction using + + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,34 +21,187 @@ #include - .text ENTRY (rawmemchr) - movd %esi, %xmm1 - movq %rdi, %rcx + movd %rsi, %xmm1 + mov %rdi, %rcx + punpcklbw %xmm1, %xmm1 - andq $~15, %rdi punpcklbw %xmm1, %xmm1 - orl $0xffffffff, %esi - movdqa (%rdi), %xmm0 + + and $63, %rcx pshufd $0, %xmm1, %xmm1 - subq %rdi, %rcx + + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 - shl %cl, %esi - pmovmskb %xmm0, %ecx - andl %esi, %ecx - jnz 1f +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches) + add $16, %rdi + and $-16, %rdi + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 -2: movdqa 16(%rdi), %xmm0 - leaq 16(%rdi), %rdi pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %ecx - testl %ecx, %ecx - jz 2b +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax -1: bsfl %ecx, %eax - addq %rdi, %rax + add %rdi, %rax + add %rcx, %rax ret + + .p2align 4 +L(unaligned_no_match): + add $16, %rdi + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + and $-64, %rdi + + .p2align 4 +L(align64_loop): + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + END (rawmemchr) strong_alias (rawmemchr, __rawmemchr) -- cgit v1.2.3