author	Xuelei Zhang <zhangxuelei4@huawei.com>	2019-12-19 13:08:11 +0000
committer	Adhemerval Zanella <adhemerval.zanella@linaro.org>	2019-12-19 16:31:04 -0300
commit	0237b61526e716fa9597f521643908a4fda3b46a (patch)
tree	bac85f002ba42d2039bbb7a7af5ac7ba3c985f3a
parent	233efd433d847e69480fe587c4c29a32fe554174 (diff)
aarch64: Optimized implementation of strcpy
Optimize the strcpy implementation by using vector loads and operations in the main loop. Compared to the current aarch64/strcpy.S, this reduces the latency of bench-strcpy cases by 5%~18% when the length of src is greater than 64 bytes, with gains throughout the benchmark.

Checked on aarch64-linux-gnu.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
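To illustrate the new main-loop shape, here is a minimal C sketch using NEON intrinsics rather than the committed assembly. The helper name bulk_copy_sketch and its in/out pointer interface are invented for this note, and the alignment and page-cross handling done before L(bulk_entry) in the real code are assumed to have happened already. The idea: a 16-byte block contains a NUL exactly when the unsigned minimum of its bytes is zero, so one UMINV plus a scalar move replaces the previous sub/orr/bic/ccmp zero-detection arithmetic on two 64-bit registers.

#include <arm_neon.h>
#include <stdint.h>

/* Sketch only: copy whole 16-byte blocks until one contains a NUL.
   vminvq_u8 is the intrinsic form of `uminv datab2, datav.16b`.  */
static void
bulk_copy_sketch (char **dstp, const char **srcp)
{
  uint8_t *dst = (uint8_t *) *dstp;
  const uint8_t *src = (const uint8_t *) *srcp;
  uint8x16_t data = vld1q_u8 (src);		/* ldr dataq, [src], #16 */

  while (vminvq_u8 (data) != 0)			/* no NUL in this block */
    {
      vst1q_u8 (dst, data);			/* str dataq, [dst], #16 */
      src += 16;
      dst += 16;
      data = vld1q_u8 (src);
    }

  /* On exit, the block at src holds the terminating NUL; the tail code
     after the loop locates it and (re)copies the final 16 bytes.  */
  *dstp = (char *) dst;
  *srcp = (const char *) src;
}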
-rw-r--r--	sysdeps/aarch64/strcpy.S	59
1 file changed, 27 insertions(+), 32 deletions(-)
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index edc16252f6..791644c0b2 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -53,6 +53,12 @@
#define len x16
#define to_align x17
+/* NEON register */
+#define dataq q2
+#define datav v2
+#define datab2 b3
+#define datav2 v3
+
#ifdef BUILD_STPCPY
#define STRCPY __stpcpy
#else
@@ -199,7 +205,6 @@ L(fp_lt2):
#endif
ret
- .p2align 6
/* Aligning here ensures that the entry code and main loop all lies
within one 64-byte cache line. */
L(bulk_entry):
@@ -214,46 +219,36 @@ L(bulk_entry):
especially on cores with a high number of issue slots per
cycle, as we get much better parallelism out of the operations. */
L(main_loop):
- stp data1, data2, [dst], #16
+ str dataq, [dst], #16
L(entry_no_page_cross):
- ldp data1, data2, [src], #16
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq L(main_loop)
+ ldr dataq, [src], #16
+ uminv datab2, datav.16b
+ mov tmp3, datav2.d[0]
+ cbnz tmp3, L(main_loop)
/* Since we know we are copying at least 16 bytes, the fastest way
to deal with the tail is to determine the location of the
trailing NUL, then (re)copy the 16 bytes leading up to that. */
- cmp has_nul1, #0
#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
+ rev64 datav.16b, datav.16b
+#endif
+ /* Locate the trailing NUL within the loaded 16 bytes. */
+ cmeq datav.16b, datav.16b, #0
+ mov data1, datav.d[0]
+ mov data2, datav.d[1]
+ cmp data1, 0
csel data1, data1, data2, ne
+ mov pos, 8
rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
- csel has_nul1, has_nul1, has_nul2, ne
-#endif
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add tmp1, pos, #72
- add pos, pos, #8
- csel pos, pos, tmp1, ne
- add src, src, pos, lsr #3
- add dst, dst, pos, lsr #3
- ldp data1, data2, [src, #-32]
- stp data1, data2, [dst, #-16]
+ clz tmp1, data1
+ csel pos, xzr, pos, ne
+ add pos, pos, tmp1, lsr 3
+ add src, src, pos
+ add dst, dst, pos
+ ldr dataq,[src, #-31]
+ str dataq,[dst, #-15]
#ifdef BUILD_STPCPY
- sub dstin, dst, #1
+ mov dstin, dst
#endif
ret
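The tail sequence above can be read as the C sketch below (little-endian only; the patch inserts a rev64 for __AARCH64EB__ before the compare). The helper name copy_tail_sketch and its precondition, that dst and src address the 16-byte block holding the NUL and that at least 16 bytes were already copied immediately before it, are assumptions made for this note, not part of the patch. With the 0xff markers CMEQ produces, the assembly's rev+clz pair is equivalent to counting trailing zero bytes, written here as __builtin_ctzll.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch only: locate the trailing NUL in the current block and
   (re)copy the 16 bytes that end exactly at it, as the assembly does
   with `ldr dataq, [src, #-31]` / `str dataq, [dst, #-15]`.  */
static char *
copy_tail_sketch (char *dst, const char *src)
{
  uint8x16_t data = vld1q_u8 ((const uint8_t *) src);
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));	/* 0xff where NUL */
  uint64_t lo = vgetq_lane_u64 (vreinterpretq_u64_u8 (cmp), 0);
  uint64_t hi = vgetq_lane_u64 (vreinterpretq_u64_u8 (cmp), 1);
  uint64_t half = lo != 0 ? lo : hi;	/* csel data1, data1, data2, ne */
  size_t pos = (lo != 0 ? 0 : 8)	/* csel pos, xzr, pos, ne */
	       + (__builtin_ctzll (half) >> 3);	/* rev + clz, lsr #3 */

  /* Copy the 16 bytes ending at the NUL; stepping back up to 15 bytes
     is safe under the assumption that at least 16 bytes precede this
     block in both buffers.  */
  vst1q_u8 ((uint8_t *) dst + pos - 15,
	    vld1q_u8 ((const uint8_t *) src + pos - 15));

  /* For stpcpy, the result points at the NUL written to dst
     (`mov dstin, dst` after dst has been advanced by pos).  */
  return dst + pos;
}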