Diffstat (limited to 'sysdeps/x86_64/multiarch/strlen-evex.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-evex.S | 436
1 file changed, 436 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
new file mode 100644
index 0000000000..cd022509cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -0,0 +1,436 @@
+/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+#  define STRLEN	__strlen_evex
+# endif
+
+# define VMOVA		vmovdqa64
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define SHIFT_REG	r9d
+# else
+#  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define SHIFT_REG	ecx
+# endif
+
+# define XMMZERO	xmm16
+# define YMMZERO	ymm16
+# define YMM1		ymm17
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+
+# define VEC_SIZE 32
+
+	.section .text.evex,"ax",@progbits
+ENTRY (STRLEN)
+# ifdef USE_AS_STRNLEN
+	/* Check for zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+#  ifdef USE_AS_WCSLEN
+	shl	$2, %RSI_LP
+#  elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+	mov	%RSI_LP, %R8_LP
+# endif
+	movl	%edi, %ecx
+	movq	%rdi, %rdx
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+
+	/* Check if we may cross page boundary with one vector load.  */
+	andl	$(2 * VEC_SIZE - 1), %ecx
+	cmpl	$VEC_SIZE, %ecx
+	ja	L(cros_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+
+# ifdef USE_AS_STRNLEN
+	jnz	L(first_vec_x0_check)
+	/* Adjust length and check the end of data.  */
+	subq	$VEC_SIZE, %rsi
+	jbe	L(max)
+# else
+	jnz	L(first_vec_x0)
+# endif
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+	/* Adjust length.  */
+	addq	%rcx, %rsi
+
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+	jmp	L(more_4x_vec)
+
+	.p2align 4
+L(cros_page_boundary):
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide shift count by 4 since each bit in K0 represents 4
+	   bytes.  */
+	movl	%ecx, %SHIFT_REG
+	sarl	$2, %SHIFT_REG
+# endif
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+
+	/* Remove the leading bytes.  */
+	sarxl	%SHIFT_REG, %eax, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+# ifdef USE_AS_STRNLEN
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+# endif
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	ret

+	.p2align 4
+L(aligned_more):
+# ifdef USE_AS_STRNLEN
+	/* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+	   with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+	   to avoid possible addition overflow.  */
+	negq	%rcx
+	addq	$VEC_SIZE, %rcx
+
+	/* Check the end of data.  */
+	subq	%rcx, %rsi
+	jbe	L(max)
+# endif
+
+	addq	$VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_STRNLEN
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+	/* Align data to 4 * VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andl	$(4 * VEC_SIZE - 1), %ecx
+	andq	$-(4 * VEC_SIZE), %rdi
+
+# ifdef USE_AS_STRNLEN
+	/* Adjust length.  */
+	addq	%rcx, %rsi
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	VMOVA	(%rdi), %YMM1
+	VMOVA	VEC_SIZE(%rdi), %YMM2
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
+
+	VPMINU	%YMM1, %YMM2, %YMM5
+	VPMINU	%YMM3, %YMM4, %YMM6
+
+	VPMINU	%YMM5, %YMM6, %YMM5
+	VPCMP	$0, %YMM5, %YMMZERO, %k0
+	ktestd	%k0, %k0
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_STRNLEN
+	jmp	L(loop_4x_vec)
+# else
+	subq	$(VEC_SIZE * 4), %rsi
+	ja	L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+	addl	$(VEC_SIZE * 2), %esi
+	jle	L(last_2x_vec)
+
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2_check)
+	subl	$VEC_SIZE, %esi
+	jle	L(max)
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3_check)
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	addl	$(VEC_SIZE * 2), %esi
+
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0_check)
+	subl	$VEC_SIZE, %esi
+	jle	L(max)
+
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	ret
+
+	.p2align 4
+L(first_vec_x0_check):
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+#  endif
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	ret
+
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+#  endif
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	ret
+
+	.p2align 4
+L(first_vec_x2_check):
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+#  endif
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	ret
+
+	.p2align 4
+L(first_vec_x3_check):
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+#  endif
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	ret
+
+	.p2align 4
+L(max):
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4
+L(first_vec_x0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	VPCMP	$0, %YMM1, %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+	VPCMP	$0, %YMM2, %YMMZERO, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+	VPCMP	$0, %YMM3, %YMMZERO, %k2
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+	VPCMP	$0, %YMM4, %YMMZERO, %k3
+	kmovd	%k3, %eax
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+	sall	$2, %eax
+# endif
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	ret
+
+END (STRLEN)
+#endif
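
The patch above is the complete implementation. As a rough companion only, here is a minimal scalar C model of the strategy the assembly relies on: VPCMP against a zero vector plus kmovd produce a 32-bit mask with one set bit per null byte in a VEC_SIZE block, and tzcntl on that mask gives the offset of the terminator. The helper names (vec_null_mask, strlen_model) and the demo buffer are hypothetical illustrations, not part of glibc or of this commit, and the model ignores the page-crossing, alignment, and strnlen length handling that the real code does.

/* Illustrative sketch only; not glibc code.  Models the mask-and-tzcnt
   idea behind strlen-evex.S in plain C (GCC/Clang builtins).  */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VEC_SIZE 32

/* Model of "VPCMP $0 ... ; kmovd": one mask bit per byte equal to 0.  */
static uint32_t
vec_null_mask (const unsigned char *p)
{
  uint32_t mask = 0;
  for (int i = 0; i < VEC_SIZE; i++)
    if (p[i] == 0)
      mask |= (uint32_t) 1 << i;
  return mask;
}

/* Model of the scan loop: process VEC_SIZE bytes at a time, then use the
   first set mask bit (tzcntl in the assembly) to locate the terminator.
   The demo buffer below is zero-padded so whole-block reads stay in
   bounds; the real code instead relies on aligned vector loads never
   crossing a page boundary.  */
static size_t
strlen_model (const unsigned char *s)
{
  const unsigned char *p = s;
  for (;;)
    {
      uint32_t mask = vec_null_mask (p);
      if (mask != 0)
	return (size_t) (p - s) + (size_t) __builtin_ctz (mask);
      p += VEC_SIZE;
    }
}

int
main (void)
{
  unsigned char buf[4 * VEC_SIZE] = "an example string for the sketch";
  printf ("model=%zu libc=%zu\n", strlen_model (buf), strlen ((char *) buf));
  return 0;
}

The same bit-per-byte mask also explains the wcslen variants: with vpcmpd each mask bit covers a 4-byte wchar_t, which is why the code divides the shift count by 4 and multiplies the tzcnt result by 4 before converting bytes back to characters.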