diff options
author | Anton Youdkevitch <anton.youdkevitch@bell-sw.com> | 2018-10-16 11:00:27 -0700 |
---|---|---|
committer | Steve Ellcey <sellcey@caviumnetworks.com> | 2018-10-16 11:00:27 -0700 |
commit | 75c1aee500ac95bde2b800b3d787c0dd805a8a82 (patch) | |
tree | 654659bd639a9d9e6cd3cb9313f7ee8cc03672dc /sysdeps/aarch64/multiarch/memcpy_thunderx.S | |
parent | bcdb1bfa0c700db25e0f355d912ec2309f9544a2 (diff) | |
download | glibc-75c1aee500ac95bde2b800b3d787c0dd805a8a82.tar glibc-75c1aee500ac95bde2b800b3d787c0dd805a8a82.tar.gz glibc-75c1aee500ac95bde2b800b3d787c0dd805a8a82.tar.bz2 glibc-75c1aee500ac95bde2b800b3d787c0dd805a8a82.zip |
aarch64: optimized memcpy implementation for thunderx2
Since aligned loads and stores are a huge performance
advantage, the implementation always tries to do aligned
access. Besides the cases where the src and dst addresses
are both aligned, or are unaligned evenly, there are cases
where src and dst are not evenly unaligned. For such cases
(if the length is big enough) the ext instruction is used
to merge-and-shift two memory chunks loaded from two
adjacent aligned locations, and then the adjusted chunk
gets stored to an aligned address.
Performance gain against the current T2 implementation:
memcpy-large: 65K-32M: +40% - +10%
memcpy-walk: 128-32M: +20% - +2%
Diffstat (limited to 'sysdeps/aarch64/multiarch/memcpy_thunderx.S')
-rw-r--r-- | sysdeps/aarch64/multiarch/memcpy_thunderx.S | 14 |
1 files changed, 0 insertions, 14 deletions
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S index de494d933d..6000365e82 100644 --- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S @@ -74,13 +74,10 @@ #if IS_IN (libc) -# ifndef USE_THUNDERX2 # undef MEMCPY # define MEMCPY __memcpy_thunderx # undef MEMMOVE # define MEMMOVE __memmove_thunderx -# define USE_THUNDERX -# endif ENTRY_ALIGN (MEMMOVE, 6) @@ -182,8 +179,6 @@ L(copy96): .p2align 4 L(copy_long): -# if defined(USE_THUNDERX) || defined (USE_THUNDERX2) - /* On thunderx, large memcpy's are helped by software prefetching. This loop is identical to the one below it but with prefetching instructions included. For loops that are less than 32768 bytes, @@ -196,11 +191,7 @@ L(copy_long): bic dst, dstin, 15 ldp D_l, D_h, [src] sub src, src, tmp1 -# if defined(USE_THUNDERX) prfm pldl1strm, [src, 384] -# elif defined(USE_THUNDERX2) - prfm pldl1strm, [src, 256] -# endif add count, count, tmp1 /* Count is now 16 too large. */ ldp A_l, A_h, [src, 16] stp D_l, D_h, [dstin] @@ -210,13 +201,9 @@ L(copy_long): subs count, count, 128 + 16 /* Test and readjust count. */ L(prefetch_loop64): -# if defined(USE_THUNDERX) tbz src, #6, 1f prfm pldl1strm, [src, 512] 1: -# elif defined(USE_THUNDERX2) - prfm pldl1strm, [src, 256] -# endif stp A_l, A_h, [dst, 16] ldp A_l, A_h, [src, 16] stp B_l, B_h, [dst, 32] @@ -230,7 +217,6 @@ L(prefetch_loop64): b L(last64) L(copy_long_without_prefetch): -# endif and tmp1, dstin, 15 bic dst, dstin, 15 |