summaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/multiarch/strrchr-evex.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/x86_64/multiarch/strrchr-evex.S')
-rw-r--r--sysdeps/x86_64/multiarch/strrchr-evex.S265
1 files changed, 265 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
new file mode 100644
index 0000000000..f920b5a584
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -0,0 +1,265 @@
+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRRCHR
+# define STRRCHR __strrchr_evex
+# endif
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# ifdef USE_AS_WCSRCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMP vpcmpd
+# define SHIFT_REG r8d
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMP vpcmpb
+# define SHIFT_REG ecx
+# endif
+
+# define XMMZERO xmm16
+# define YMMZERO ymm16
+# define YMMMATCH ymm17
+# define YMM1 ymm18
+
+# define VEC_SIZE 32
+
+ .section .text.evex,"ax",@progbits
+ENTRY (STRRCHR)
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMMMATCH. */
+ VPBROADCAST %esi, %YMMMATCH
+
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ VMOVU (%rdi), %YMM1
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+
+ addq $VEC_SIZE, %rdi
+
+ testl %eax, %eax
+ jnz L(first_vec)
+
+ testl %ecx, %ecx
+ jnz L(return_null)
+
+ andq $-VEC_SIZE, %rdi
+ xorl %edx, %edx
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(first_vec):
+ /* Check if there is a null byte. */
+ testl %ecx, %ecx
+ jnz L(char_and_nul_in_first_vec)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifdef USE_AS_WCSRCHR
+ /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ bytes. */
+ movl %ecx, %SHIFT_REG
+ sarl $2, %SHIFT_REG
+# endif
+
+ VMOVA (%rdi), %YMM1
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %edx
+ kmovd %k1, %eax
+
+ shrxl %SHIFT_REG, %edx, %edx
+ shrxl %SHIFT_REG, %eax, %eax
+ addq $VEC_SIZE, %rdi
+
+ /* Check if there is a CHAR. */
+ testl %eax, %eax
+ jnz L(found_char)
+
+ testl %edx, %edx
+ jnz L(return_null)
+
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(found_char):
+ testl %edx, %edx
+ jnz L(char_and_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ leaq (%rdi, %rcx), %rsi
+
+ .p2align 4
+L(aligned_loop):
+ VMOVA (%rdi), %YMM1
+ addq $VEC_SIZE, %rdi
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ VMOVA (%rdi), %YMM1
+ add $VEC_SIZE, %rdi
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ VMOVA (%rdi), %YMM1
+ addq $VEC_SIZE, %rdi
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ VMOVA (%rdi), %YMM1
+ addq $VEC_SIZE, %rdi
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+ orl %eax, %ecx
+ jz L(aligned_loop)
+
+ .p2align 4
+L(char_nor_null):
+ /* Find a CHAR or a null byte in a loop. */
+ testl %eax, %eax
+ jnz L(match)
+L(return_value):
+ testl %edx, %edx
+ jz L(return_null)
+ movl %edx, %eax
+ movq %rsi, %rdi
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+# endif
+ ret
+
+ .p2align 4
+L(match):
+ /* Find a CHAR. Check if there is a null byte. */
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(find_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(find_nul):
+ /* Mask out any matching bits after the null byte. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* If there is no CHAR here, return the remembered one. */
+ jz L(return_value)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+# endif
+ ret
+
+ .p2align 4
+L(char_and_nul):
+ /* Find both a CHAR and a null byte. */
+ addq %rcx, %rdi
+ movl %edx, %ecx
+L(char_and_nul_in_first_vec):
+ /* Mask out any matching bits after the null byte. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* Return null pointer if the null byte comes first. */
+ jz L(return_null)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+# endif
+ ret
+
+ .p2align 4
+L(return_null):
+ xorl %eax, %eax
+ ret
+
+END (STRRCHR)
+#endif