-rw-r--r--  sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S        |  22
-rw-r--r--  sysdeps/powerpc/powerpc64/le/power10/strlen.S           | 160
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/Makefile            |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c   |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S |  21
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c         |   4
6 files changed, 188 insertions(+), 27 deletions(-)
diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
new file mode 100644
index 0000000000..5351c2634f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
@@ -0,0 +1,22 @@
+/* Optimized rawmemchr implementation for POWER10 LE.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define USE_AS_RAWMEMCHR 1
+#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
index ca7e9eb3d8..dda5282f1b 100644
--- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S
+++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
@@ -18,10 +18,50 @@
 
 #include <sysdep.h>
 
-#ifndef STRLEN
-# define STRLEN __strlen
-# define DEFINE_STRLEN_HIDDEN_DEF 1
-#endif
+/* To reuse the code for rawmemchr, we have some extra steps compared to the
+   strlen implementation:
+     - Sum the initial value of r3 with the position at which the char was
+       found, to guarantee we return a pointer and not the length.
+     - In the main loop, subtract each byte by the char we are looking for,
+       so we can keep using vminub to quickly check 64B at once.  */
+#ifdef USE_AS_RAWMEMCHR
+# ifndef RAWMEMCHR
+#  define FUNCNAME __rawmemchr
+# else
+#  define FUNCNAME RAWMEMCHR
+# endif
+# define MCOUNT_NARGS 2
+# define VREG_ZERO v20
+# define OFF_START_LOOP 256
+# define RAWMEMCHR_SUBTRACT_VECTORS \
+        vsububm   v4,v4,v18;        \
+        vsububm   v5,v5,v18;        \
+        vsububm   v6,v6,v18;        \
+        vsububm   v7,v7,v18;
+# define TAIL(vreg,increment)       \
+        vctzlsbb  r4,vreg;          \
+        addi      r4,r4,increment;  \
+        add       r3,r5,r4;         \
+        blr
+
+#else /* strlen */
+
+# ifndef STRLEN
+#  define FUNCNAME __strlen
+#  define DEFINE_STRLEN_HIDDEN_DEF 1
+# else
+#  define FUNCNAME STRLEN
+# endif
+# define MCOUNT_NARGS 1
+# define VREG_ZERO v18
+# define OFF_START_LOOP 192
+# define TAIL(vreg,increment)       \
+        vctzlsbb  r4,vreg;          \
+        subf      r3,r3,r5;         \
+        addi      r4,r4,increment;  \
+        add       r3,r3,r4;         \
+        blr
+#endif /* USE_AS_RAWMEMCHR */
 
 /* TODO: Replace macros by the actual instructions when minimum binutils becomes
    >= 2.35.  This is used to keep compatibility with older versions.  */
@@ -50,33 +90,41 @@
         li        r6,offset;              \
         LXVP(v4+32,offset,addr);          \
         LXVP(v6+32,offset+32,addr);       \
+        RAWMEMCHR_SUBTRACT_VECTORS;       \
         vminub    v14,v4,v5;              \
         vminub    v15,v6,v7;              \
         vminub    v16,v14,v15;            \
-        vcmpequb. v0,v16,v18;             \
+        vcmpequb. v0,v16,VREG_ZERO;       \
         bne       cr6,L(label)
 
-#define TAIL(vreg,increment)       \
-        vctzlsbb  r4,vreg;         \
-        subf      r3,r3,r5;        \
-        addi      r4,r4,increment; \
-        add       r3,r3,r4;        \
-        blr
-
 /* Implements the function
 
    int [r3] strlen (const void *s [r3])
 
+   but when USE_AS_RAWMEMCHR is set, implements the function
+
+   void* [r3] rawmemchr (const void *s [r3], int c [r4])
+
    The implementation can load bytes past a matching byte, but only
    up to the next 64B boundary, so it never crosses a page.  */
 
 .machine power9
 
-ENTRY_TOCLESS (STRLEN, 4)
-        CALL_MCOUNT 1
+ENTRY_TOCLESS (FUNCNAME, 4)
+        CALL_MCOUNT MCOUNT_NARGS
 
-        vspltisb  v18,0
+#ifdef USE_AS_RAWMEMCHR
+        xori      r5,r4,0xff
+
+        mtvsrd    v18+32,r4     /* matching char in v18  */
+        mtvsrd    v19+32,r5     /* non matching char in v19  */
+
+        vspltb    v18,v18,7     /* replicate  */
+        vspltb    v19,v19,7     /* replicate  */
+#else
         vspltisb  v19,-1
+#endif
+        vspltisb  VREG_ZERO,0
 
         /* Next 16B-aligned address.  Prepare address for L(aligned).  */
         addi      r5,r3,16
@@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4)
         vcmpequb. v6,v0,v18
         beq       cr6,L(aligned)
 
+#ifdef USE_AS_RAWMEMCHR
+        vctzlsbb  r6,v6
+        add       r3,r3,r6
+#else
         vctzlsbb  r3,v6
+#endif
         blr
 
-        /* Test next 176B, 16B at a time.  The main loop is optimized for longer
-           strings, so checking the first bytes in 16B chunks benefits a lot
-           small strings.  */
+        /* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
+           optimized for longer strings, so checking the first bytes in 16B
+           chunks benefits a lot small strings.  */
         .p2align 5
L(aligned):
+#ifdef USE_AS_RAWMEMCHR
+        cmpdi     cr5,r4,0      /* Check if c == 0.  This will be useful to
+                                   choose how we will perform the main loop.  */
+#endif
         /* Prepare address for the loop.  */
-        addi      r4,r3,192
+        addi      r4,r3,OFF_START_LOOP
         clrrdi    r4,r4,6
 
         CHECK16(v0,0,r5,tail1)
@@ -113,15 +170,43 @@ L(aligned):
         CHECK16(v8,128,r5,tail9)
         CHECK16(v9,144,r5,tail10)
         CHECK16(v10,160,r5,tail11)
+#ifdef USE_AS_RAWMEMCHR
+        CHECK16(v0,176,r5,tail12)
+        CHECK16(v1,192,r5,tail13)
+        CHECK16(v2,208,r5,tail14)
+        CHECK16(v3,224,r5,tail15)
+#endif
 
         addi      r5,r4,128
 
+#ifdef USE_AS_RAWMEMCHR
+        /* If c == 0, use the same loop as strlen, without the vsububm.  */
+        beq       cr5,L(loop)
+
+        /* This is very similar to the block after L(loop), the difference is
+           that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
+           each byte loaded by the char we are looking for, this way we can keep
+           using vminub to merge the results and checking for nulls.  */
+        .p2align 5
+L(rawmemchr_loop):
+        CHECK64(0,r4,pre_tail_64b)
+        CHECK64(64,r4,pre_tail_64b)
+        addi      r4,r4,256
+
+        CHECK64(0,r5,tail_64b)
+        CHECK64(64,r5,tail_64b)
+        addi      r5,r5,256
+
+        b         L(rawmemchr_loop)
+#endif
         /* Switch to a more aggressive approach checking 64B each time.  Use 2
            pointers 128B apart and unroll the loop once to make the pointer
            updates and usages separated enough to avoid stalls waiting for
           address calculation.  */
         .p2align 5
L(loop):
+#undef RAWMEMCHR_SUBTRACT_VECTORS
+#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
         CHECK64(0,r4,pre_tail_64b)
         CHECK64(64,r4,pre_tail_64b)
         addi      r4,r4,256
@@ -140,10 +225,10 @@ L(tail_64b):
           block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
           low 16B bytes into vx+1, and the high into vx, so the order here is
           v5, v4, v7, v6.  */
-        vcmpequb  v1,v5,v18
-        vcmpequb  v2,v4,v18
-        vcmpequb  v3,v7,v18
-        vcmpequb  v4,v6,v18
+        vcmpequb  v1,v5,VREG_ZERO
+        vcmpequb  v2,v4,VREG_ZERO
+        vcmpequb  v3,v7,VREG_ZERO
+        vcmpequb  v4,v6,VREG_ZERO
 
         /* Take into account the other 64B blocks we had already checked.  */
         add       r5,r5,r6
@@ -165,7 +250,9 @@ L(tail_64b):
         or        r10,r8,r7
 
         cnttzd    r0,r10        /* Count trailing zeros before the match.  */
+#ifndef USE_AS_RAWMEMCHR
         subf      r5,r3,r5
+#endif
         add       r3,r5,r0      /* Compute final length.  */
         blr
 
@@ -213,9 +300,32 @@ L(tail10):
L(tail11):
         TAIL(v10,160)
 
-END (STRLEN)
+#ifdef USE_AS_RAWMEMCHR
+        .p2align 5
+L(tail12):
+        TAIL(v0,176)
+
+        .p2align 5
+L(tail13):
+        TAIL(v1,192)
+
+        .p2align 5
+L(tail14):
+        TAIL(v2,208)
+
+        .p2align 5
+L(tail15):
+        TAIL(v3,224)
+#endif
+
+END (FUNCNAME)
 
-#ifdef DEFINE_STRLEN_HIDDEN_DEF
+#ifdef USE_AS_RAWMEMCHR
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
+#else
+# ifdef DEFINE_STRLEN_HIDDEN_DEF
 weak_alias (__strlen, strlen)
 libc_hidden_builtin_def (strlen)
+# endif
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index ea50b61674..e6e013db17 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
+                   rawmemchr-power9 rawmemchr-power10 \
                    strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-                   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
-                   strlen-power10
+                   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b123c6a3d3..a92b67448e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -258,6 +258,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, rawmemchr,
 #ifdef __LITTLE_ENDIAN__
               IFUNC_IMPL_ADD (array, i, rawmemchr,
+                              (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+                              && (hwcap & PPC_FEATURE_HAS_VSX),
+                              __rawmemchr_power10)
+              IFUNC_IMPL_ADD (array, i, rawmemchr,
                              hwcap2 & PPC_FEATURE2_ARCH_3_00,
                              __rawmemchr_power9)
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
new file mode 100644
index 0000000000..bf1ed7e194
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
@@ -0,0 +1,21 @@
+/* Optimized rawmemchr implementation for PowerPC64/POWER10.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define RAWMEMCHR __rawmemchr_power10
+
+#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
index d906a4ea50..c0ffea2b93 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
 extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
 # ifdef __LITTLE_ENDIAN__
 extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
+extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden;
 # endif
 
 # undef __rawmemchr
@@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
 # ifdef __LITTLE_ENDIAN__
+                       (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+                       && (hwcap & PPC_FEATURE_HAS_VSX)
+                       ? __rawmemchr_power10 :
                        (hwcap2 & PPC_FEATURE2_ARCH_3_00)
                        ? __rawmemchr_power9 :
 # endif
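
Note (not part of the patch): the trick the shared strlen.S code relies on is that subtracting the searched-for byte from every loaded byte turns a match into a zero byte, so the existing null-detection path (vminub merging followed by a compare against the zero vector) finds the match with no extra control flow. Below is a minimal scalar C sketch of that idea only; the function and variable names are made up for illustration and do not come from the glibc sources.

#include <stdio.h>

/* Scalar illustration of the byte-subtraction trick used by the patch.
   In the POWER10 code the subtraction is done 64 bytes at a time with
   vsububm (RAWMEMCHR_SUBTRACT_VECTORS); here it is one byte per step.  */
static const unsigned char *
find_byte_by_subtraction (const unsigned char *s, unsigned char c)
{
  for (;;)
    {
      /* After subtracting c, a matching byte becomes zero, so the same
         "find the first zero byte" logic used for strlen applies.  */
      if ((unsigned char) (*s - c) == 0)
        return s;
      s++;
    }
}

int
main (void)
{
  const unsigned char str[] = "powerpc64le rawmemchr";
  const unsigned char *p = find_byte_by_subtraction (str, 'r');
  printf ("offset of first 'r': %td\n", p - str);   /* prints 4 */
  return 0;
}

In the patch itself this subtraction is skipped when c == 0 (the beq cr5,L(loop) path), where the loop is identical to strlen because the null byte is already what the zero-detection finds.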