path: root/sysdeps/aarch64/multiarch/memcpy_thunderx.S
author	Anton Youdkevitch <anton.youdkevitch@bell-sw.com>	2018-10-16 11:00:27 -0700
committer	Steve Ellcey <sellcey@caviumnetworks.com>	2018-10-16 11:00:27 -0700
commit	75c1aee500ac95bde2b800b3d787c0dd805a8a82 (patch)
tree	654659bd639a9d9e6cd3cb9313f7ee8cc03672dc /sysdeps/aarch64/multiarch/memcpy_thunderx.S
parent	bcdb1bfa0c700db25e0f355d912ec2309f9544a2 (diff)
aarch64: optimized memcpy implementation for thunderx2
Since aligned loads and stores are a huge performance advantage, the implementation always tries to do aligned accesses. Besides the cases where the src and dst addresses are both aligned, or are unaligned by the same amount, there are cases where src and dst are unaligned relative to each other. For such cases (if the length is large enough) the ext instruction is used to merge and shift two memory chunks loaded from two adjacent aligned locations, and the adjusted chunk is then stored to an aligned address.

Performance gain against the current T2 implementation:
  memcpy-large: 65K-32M: +40% to +10%
  memcpy-walk: 128-32M: +20% to +2%
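As a rough illustration of the merge-and-shift idea described above (this sketch is not part of the patch), two aligned 16-byte chunks can be combined with ext and stored to an aligned destination. The register assignments and the fixed 3-byte misalignment are assumptions made purely for the example:

	/* Illustrative sketch only, not from the patch: copy 16 bytes from
	   an unaligned src (x1) to an aligned dst (x0), assuming src is
	   3 bytes past a 16-byte boundary.  */
	bic	x3, x1, 15			/* x3 = src rounded down to 16 bytes.  */
	ld1	{v0.16b}, [x3], #16		/* First aligned 16-byte chunk.  */
	ld1	{v1.16b}, [x3], #16		/* Next aligned 16-byte chunk.  */
	ext	v2.16b, v0.16b, v1.16b, #3	/* Merge and shift out the 3 leading bytes.  */
	st1	{v2.16b}, [x0], #16		/* Aligned 16-byte store to dst.  */

Note that ext takes an immediate shift amount, so a complete version would likely need a separate code path per misalignment; the commit message only states that ext is used when src and dst are not evenly unaligned.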
Diffstat (limited to 'sysdeps/aarch64/multiarch/memcpy_thunderx.S')
-rw-r--r--	sysdeps/aarch64/multiarch/memcpy_thunderx.S	14
1 file changed, 0 insertions, 14 deletions
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index de494d933d..6000365e82 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -74,13 +74,10 @@
#if IS_IN (libc)
-# ifndef USE_THUNDERX2
# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx
-# define USE_THUNDERX
-# endif
ENTRY_ALIGN (MEMMOVE, 6)
@@ -182,8 +179,6 @@ L(copy96):
.p2align 4
L(copy_long):
-# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
-
/* On thunderx, large memcpy's are helped by software prefetching.
This loop is identical to the one below it but with prefetching
instructions included. For loops that are less than 32768 bytes,
@@ -196,11 +191,7 @@ L(copy_long):
bic dst, dstin, 15
ldp D_l, D_h, [src]
sub src, src, tmp1
-# if defined(USE_THUNDERX)
prfm pldl1strm, [src, 384]
-# elif defined(USE_THUNDERX2)
- prfm pldl1strm, [src, 256]
-# endif
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
@@ -210,13 +201,9 @@ L(copy_long):
subs count, count, 128 + 16 /* Test and readjust count. */
L(prefetch_loop64):
-# if defined(USE_THUNDERX)
tbz src, #6, 1f
prfm pldl1strm, [src, 512]
1:
-# elif defined(USE_THUNDERX2)
- prfm pldl1strm, [src, 256]
-# endif
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
@@ -230,7 +217,6 @@ L(prefetch_loop64):
b L(last64)
L(copy_long_without_prefetch):
-# endif
and tmp1, dstin, 15
bic dst, dstin, 15
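For context on the prefetch code retained in this file: the tbz/prfm pair issues a stream prefetch 512 bytes ahead of the source, but only when bit 6 of the source address is set, i.e. once per 128 bytes even though each iteration copies 64 bytes. A minimal standalone sketch of that gating pattern follows; the register roles (x0 = dst, x1 = src, x2 = remaining count, assumed a positive multiple of 64) are assumptions for the example only:

	/* Sketch only: 64 bytes per iteration, prefetching 512 bytes ahead
	   on every other iteration (whenever bit 6 of src is set).  */
0:
	tbz	x1, #6, 1f		/* Bit 6 clear: skip the prefetch this pass.  */
	prfm	pldl1strm, [x1, 512]	/* Stream-prefetch 512 bytes ahead of src.  */
1:
	ldp	x6, x7, [x1]		/* Copy 64 bytes with four ldp/stp pairs.  */
	stp	x6, x7, [x0]
	ldp	x8, x9, [x1, 16]
	stp	x8, x9, [x0, 16]
	ldp	x10, x11, [x1, 32]
	stp	x10, x11, [x0, 32]
	ldp	x12, x13, [x1, 48]
	stp	x12, x13, [x0, 48]
	add	x1, x1, 64
	add	x0, x0, 64
	subs	x2, x2, 64		/* Assumes count is a positive multiple of 64.  */
	b.ne	0b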