aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWilco Dijkstra <wdijkstr@arm.com>2018-11-20 12:37:00 +0000
committerWilco Dijkstra <wdijkstr@arm.com>2018-11-20 12:37:00 +0000
commit5770c0ad1e0c784e817464ca2cf9436a58c9beb7 (patch)
tree6616d15f2d44823b4c70b0fe607b4c7927fe45ac
parent9a62a9397d0a25643922d8d053f04ee895100d9a (diff)
downloadglibc-5770c0ad1e0c784e817464ca2cf9436a58c9beb7.tar
glibc-5770c0ad1e0c784e817464ca2cf9436a58c9beb7.tar.gz
glibc-5770c0ad1e0c784e817464ca2cf9436a58c9beb7.tar.bz2
glibc-5770c0ad1e0c784e817464ca2cf9436a58c9beb7.zip
[AArch64] Adjust writeback in non-zero memset
This fixes an inefficiency in the non-zero memset. Delaying the writeback until the end of the loop is slightly faster on some cores - this shows a ~5% performance gain on Cortex-A53 when doing large non-zero memsets. * sysdeps/aarch64/memset.S (MEMSET): Improve non-zero memset loop.
-rw-r--r--ChangeLog4
-rw-r--r--sysdeps/aarch64/memset.S7
2 files changed, 8 insertions, 3 deletions
diff --git a/ChangeLog b/ChangeLog
index d340866c43..be2344248f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2018-11-20 Wilco Dijkstra <wdijkstr@arm.com>
+
+ * sysdeps/aarch64/memset.S (MEMSET): Improve non-zero memset loop.
+
2018-11-20 Joseph Myers <joseph@codesourcery.com>
* conform/conformtest.py (ElementTest.run): Use unique identifiers
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 4a45459361..9738cf5fd5 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -89,10 +89,10 @@ L(set_long):
b.eq L(try_zva)
L(no_zva):
sub count, dstend, dst /* Count is 16 too large. */
- add dst, dst, 16
+ sub dst, dst, 16 /* Dst is biased by -32. */
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
+1: stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
L(tail64):
subs count, count, 64
b.hi 1b
@@ -183,6 +183,7 @@ L(zva_other):
subs count, count, zva_len
b.hs 3b
4: add count, count, zva_len
+ sub dst, dst, 32 /* Bias dst for tail loop. */
b L(tail64)
#endif