aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
authorRajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>2017-02-07 10:40:26 +0530
committerRajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>2017-02-07 10:40:26 +0530
commit04f0fd640d78ff715cb9409f03d55366ed76966e (patch)
treedfef9d58915c0a6788b1c76524949a8ed5546e5f /sysdeps
parentdcd4cd575678c94f042c1a6b08663c648781a9eb (diff)
downloadglibc-04f0fd640d78ff715cb9409f03d55366ed76966e.tar
glibc-04f0fd640d78ff715cb9409f03d55366ed76966e.tar.gz
glibc-04f0fd640d78ff715cb9409f03d55366ed76966e.tar.bz2
glibc-04f0fd640d78ff715cb9409f03d55366ed76966e.zip
powerpc: Improve strcmp performance for shorter strings
For strings >16B and <32B existing algorithm takes more time than default implementation when strings are placed closed to end of page. This is due to byte by byte access for handling page cross. This is improved by following >32B code path where the address is adjusted to aligned memory before doing load doubleword operation instead of loading bytes. Tested on powerpc64 and powerpc64le.
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strcmp.S30
-rw-r--r--sysdeps/powerpc/powerpc64/power9/strcmp.S30
2 files changed, 16 insertions, 44 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
index c34ff4a23b..d46bff80cd 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -30,21 +30,21 @@
EALIGN (strcmp, 4, 0)
li r0,0
- /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
the code:
(((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
- with PAGE_SIZE being 4096 and ITER_SIZE begin 32. */
+ with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */
rldicl r7,r3,0,52
rldicl r9,r4,0,52
- cmpldi cr7,r7,4096-32
+ cmpldi cr7,r7,4096-16
bgt cr7,L(pagecross_check)
- cmpldi cr5,r9,4096-32
+ cmpldi cr5,r9,4096-16
bgt cr5,L(pagecross_check)
- /* For short string up to 32 bytes, load both s1 and s2 using
+ /* For short string up to 16 bytes, load both s1 and s2 using
unaligned dwords and compare. */
ld r8,0(r3)
ld r10,0(r4)
@@ -60,25 +60,11 @@ EALIGN (strcmp, 4, 0)
orc. r9,r12,r11
bne cr0,L(different_nocmpb)
- ld r8,16(r3)
- ld r10,16(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- ld r8,24(r3)
- ld r10,24(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- addi r7,r3,32
- addi r4,r4,32
+ addi r7,r3,16
+ addi r4,r4,16
L(align_8b):
- /* Now it has checked for first 32 bytes, align source1 to doubleword
+ /* Now it has checked for first 16 bytes, align source1 to doubleword
and adjust source2 address. */
rldicl r9,r7,0,61 /* source1 alignment to doubleword */
subf r4,r9,r4 /* Adjust source2 address based on source1
diff --git a/sysdeps/powerpc/powerpc64/power9/strcmp.S b/sysdeps/powerpc/powerpc64/power9/strcmp.S
index 3e32396c94..17ec8c24c3 100644
--- a/sysdeps/powerpc/powerpc64/power9/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power9/strcmp.S
@@ -65,21 +65,21 @@
EALIGN (strcmp, 4, 0)
li r0, 0
- /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
the code:
(((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
- with PAGE_SIZE being 4096 and ITER_SIZE begin 32. */
+ with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */
rldicl r7, r3, 0, 52
rldicl r9, r4, 0, 52
- cmpldi cr7, r7, 4096-32
+ cmpldi cr7, r7, 4096-16
bgt cr7, L(pagecross_check)
- cmpldi cr5, r9, 4096-32
+ cmpldi cr5, r9, 4096-16
bgt cr5, L(pagecross_check)
- /* For short strings up to 32 bytes, load both s1 and s2 using
+ /* For short strings up to 16 bytes, load both s1 and s2 using
unaligned dwords and compare. */
ld r8, 0(r3)
ld r10, 0(r4)
@@ -95,25 +95,11 @@ EALIGN (strcmp, 4, 0)
orc. r9, r12, r11
bne cr0, L(different_nocmpb)
- ld r8, 16(r3)
- ld r10, 16(r4)
- cmpb r12, r8, r0
- cmpb r11, r8, r10
- orc. r9, r12, r11
- bne cr0, L(different_nocmpb)
-
- ld r8, 24(r3)
- ld r10, 24(r4)
- cmpb r12, r8, r0
- cmpb r11, r8, r10
- orc. r9, r12, r11
- bne cr0, L(different_nocmpb)
-
- addi r7, r3, 32
- addi r4, r4, 32
+ addi r7, r3, 16
+ addi r4, r4, 16
L(align):
- /* Now it has checked for first 32 bytes. */
+ /* Now it has checked for first 16 bytes. */
vspltisb v0, 0
vspltisb v2, -1
lvsr v6, 0, r4 /* Compute mask. */