diff options
Diffstat (limited to 'sysdeps/powerpc')
-rw-r--r-- | sysdeps/powerpc/powerpc64/le/power10/memcpy.S | 198 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c | 6 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S | 26 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/multiarch/memcpy.c | 7 |
5 files changed, 238 insertions, 1 deletions
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcpy.S b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S new file mode 100644 index 0000000000..ad1414db4a --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S @@ -0,0 +1,198 @@ +/* Optimized memcpy implementation for POWER10. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. r3 is never written; r10 is the moving destination pointer. */ + + .machine power9 +ENTRY_TOCLESS (MEMCPY, 5) + CALL_MCOUNT 3 + + /* Copy up to 16 bytes. */ + sldi r6,r5,56 /* Prepare [l|st]xvl counter. */ + lxvl v10,r4,r6 /* The byte count is read from the top byte of r6. */ + stxvl v10,r3,r6 + subic. r6,r5,16 /* Return if len <= 16. */ + blelr + + /* If len >= 256, assume nothing got copied before and copy + again. This might cause issues with overlapped memory, but memcpy + is not expected to treat overlapped memory. */ + cmpdi r5,256 + bge L(copy_ge_256) + /* 16 < len < 256 and the first 16 bytes have already been copied. */ + addi r10,r3,16 /* Keep r3 intact as return value. */ + addi r4,r4,16 + subi r5,r5,16 + b L(copy_lt_256) /* Avoid the main loop if len < 256. */ + + .p2align 5 +L(copy_ge_256): + mr r10,r3 /* Keep r3 intact as return value. 
*/ + /* Align dst to 16 bytes. */ + andi. r9,r10,0xf /* r9 = dst & 15. */ + beq L(dst_is_align_16) + lxv v10,0(r4) + subfic r12,r9,16 /* r12 = 16 - r9, the bytes up to the next boundary. */ + subf r5,r12,r5 /* len -= r12. */ + add r4,r4,r12 /* src += r12. */ + stxv v10,0(r3) /* Stores 16 bytes; bytes past r12 are rewritten later. */ + add r10,r3,r12 /* r10 = first 16-byte aligned address above dst. */ + +L(dst_is_align_16): + srdi r9,r5,7 /* Divide by 128. */ + mtctr r9 /* CTR = 128-byte iteration count; nonzero since len >= 241. */ + addi r6,r4,64 + addi r7,r10,64 + + + /* Main loop, copy 128 bytes per iteration. + Use r6=src+64 and r7=dest+64 in order to reduce the dependency on + r4 and r10. */ + .p2align 5 +L(copy_128): + + lxv v10, 0(r4) + lxv v11, 16(r4) + lxv v12, 32(r4) + lxv v13, 48(r4) + + addi r4,r4,128 + + stxv v10, 0(r10) + stxv v11, 16(r10) + stxv v12, 32(r10) + stxv v13, 48(r10) + + addi r10,r10,128 + + lxv v10, 0(r6) + lxv v11, 16(r6) + lxv v12, 32(r6) + lxv v13, 48(r6) + + addi r6,r6,128 + + stxv v10, 0(r7) + stxv v11, 16(r7) + stxv v12, 32(r7) + stxv v13, 48(r7) + + addi r7,r7,128 + + bdnz L(copy_128) + + clrldi. r5,r5,64-7 /* r5 = len % 128. Have we copied everything? */ + beqlr + + .p2align 5 +L(copy_lt_256): + cmpdi r5,16 + ble L(copy_le_16) /* A single lxvl/stxvl pair covers the rest. */ + srdi. r9,r5,5 /* Divide by 32. */ + beq L(copy_lt_32) /* 16 < len < 32. */ + mtctr r9 /* CTR = number of 32-byte chunks. */ + /* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce + the dependency on r4 and r10. */ + addi r6,r4,32 + addi r7,r10,32 + addi r8,r4,64 + addi r9,r10,64 + + .p2align 5 + /* Copy 32 bytes at a time, unaligned. + The loop is unrolled 3 times in order to reduce the dependency on + r4 and r10, copying up-to 96 bytes per iteration. */ +L(copy_32): + lxv v10, 0(r4) + lxv v11, 16(r4) + stxv v10, 0(r10) + stxv v11, 16(r10) + bdz L(end_copy_32a) /* Done; r4 and r10 are not advanced yet. */ + addi r4,r4,96 + addi r10,r10,96 + + lxv v10, 0(r6) + lxv v11, 16(r6) + addi r6,r6,96 + stxv v10, 0(r7) + stxv v11, 16(r7) + bdz L(end_copy_32b) /* Done; r4 and r10 are already 96 ahead. */ + addi r7,r7,96 + + lxv v12, 0(r8) + lxv v13, 16(r8) + addi r8,r8,96 + stxv v12, 0(r9) + stxv v13, 16(r9) + addi r9,r9,96 + bdnz L(copy_32) + + clrldi. r5,r5,64-5 /* r5 = len % 32. Have we copied everything? */ + beqlr + cmpdi r5,16 + ble L(copy_le_16) + b L(copy_lt_32) + + .p2align 5 +L(end_copy_32a): + clrldi. r5,r5,64-5 /* Have we copied everything? 
*/ + beqlr + /* 32 bytes have been copied since the last update of r4 and r10. */ + addi r4,r4,32 + addi r10,r10,32 + cmpdi r5,16 + ble L(copy_le_16) + b L(copy_lt_32) + + .p2align 5 +L(end_copy_32b): + clrldi. r5,r5,64-5 /* Have we copied everything? */ + beqlr + /* The last iteration of the loop copied 64 bytes. Update r4 and r10 + accordingly. */ + addi r4,r4,-32 + addi r10,r10,-32 + cmpdi r5,16 + ble L(copy_le_16) /* Otherwise fall through to copy_lt_32. */ + + .p2align 5 +L(copy_lt_32): + lxv v10, 0(r4) /* 16 < len < 32 here: copy one full vector. */ + stxv v10, 0(r10) + addi r4,r4,16 + addi r10,r10,16 + subi r5,r5,16 /* Falls through with 0 < len < 16. */ + + .p2align 5 +L(copy_le_16): + sldi r6,r5,56 /* Prepare [l|st]xvl counter; len <= 16 here. */ + lxvl v10,r4,r6 + stxvl v10,r10,r6 + blr + + +END_GEN_TB (MEMCPY,TB_TOCLESS) +libc_hidden_builtin_def (memcpy) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index a82219c490..9ef12d3563 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ strncase-power8 ifneq (,$(filter %le,$(config-machine))) -sysdep_routines += memmove-power10 \ +sysdep_routines += memcpy-power10 memmove-power10 \ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ strlen-power10 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index d00bcc8178..1ab56bb2c9 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -51,6 +51,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #ifdef SHARED /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. 
*/ IFUNC_IMPL (i, name, memcpy, +#ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, memcpy, + hwcap2 & PPC_FEATURE2_ARCH_3_1 + && hwcap & PPC_FEATURE_HAS_VSX, + __memcpy_power10) +#endif IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, __memcpy_power8_cached) IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX, diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S new file mode 100644 index 0000000000..70e0fc3ed6 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S @@ -0,0 +1,26 @@ +/* Optimized memcpy implementation for POWER10. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +#define MEMCPY __memcpy_power10 /* Symbol name used by the included file. */ + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) /* Expands to nothing here. */ + +#include <sysdeps/powerpc/powerpc64/le/power10/memcpy.S> +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c index 5733192932..53ab32ef26 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c @@ -36,8 +36,15 @@ extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden; +# if defined __LITTLE_ENDIAN__ +extern __typeof (__redirect_memcpy) __memcpy_power10 attribute_hidden; +# endif libc_ifunc (__libc_memcpy, +# if defined __LITTLE_ENDIAN__ + (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX) + ? __memcpy_power10 : +# endif ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt) ? __memcpy_power8_cached : (hwcap & PPC_FEATURE_HAS_VSX) |