Diffstat (limited to 'sysdeps/powerpc/powerpc64/le/power10/strcmp.S')
-rw-r--r--  sysdeps/powerpc/powerpc64/le/power10/strcmp.S  161
1 file changed, 95 insertions(+), 66 deletions(-)
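
The new entry sequence replaces the pointer-swap bookkeeping with an
up-front page-cross test: if neither string is within 16 bytes of the
end of its page, a single unaligned 16-byte vector compare handles the
common case. A minimal C model of that guard, assuming the 4 KiB page
size the code hard-codes (the helper name is ours, not glibc's):

    #include <stdint.h>

    /* An unaligned 16-byte load from P cannot fault as long as it
       stays within P's page, i.e. P's offset into the page is at
       most 4096 - 16.  Mirrors the andi./cmpldi pair at entry.  */
    static inline int
    load16_stays_in_page (const void *p)
    {
      return ((uintptr_t) p & 4095) <= 4096 - 16;
    }
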
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
index a3c1adad53..3406f4f26a 100644
--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
@@ -62,7 +62,7 @@
lxvl 32+v5,reg2,r0; \
add reg1,reg1,len_reg; \
add reg2,reg2,len_reg; \
- vcmpnezb. v7,v4,v5; \
+ vcmpnezb v7,v4,v5; \
vctzlsbb r6,v7; \
cmpld cr7,r6,len_reg; \
blt cr7,L(different); \
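
The macro hunk above drops the record form of the compare: vcmpnezb.
also writes CR6, but CHECK_N_BYTES locates the mismatch with vctzlsbb
and tests it against the length via cmpld, so the CR update is dead
work. Byte-wise, the vcmpnezb/vctzlsbb pair computes roughly the
following (a sketch; the helper name is ours):

    /* Index of the first position where two 16-byte chunks differ or
       either holds a NUL; 16 when no such byte exists, matching
       vctzlsbb on an all-zero result vector.  */
    static int
    first_diff_or_nul (const unsigned char *a, const unsigned char *b)
    {
      for (int i = 0; i < 16; i++)
        if (a[i] != b[i] || a[i] == 0 || b[i] == 0)
          return i;
      return 16;
    }
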
@@ -72,70 +72,110 @@
.machine power9
ENTRY_TOCLESS (STRCMP, 4)
- li r11,16
- /* eq bit of cr1 used as swap status flag to indicate if
- source pointers were swapped. */
- crclr 4*cr1+eq
- vspltisb v19,-1
- andi. r7,r3,15
- sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
- andi. r9,r4,15
- sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
- cmpld cr7,r7,r5
- beq cr7,L(same_aligned)
- blt cr7,L(nalign1_min)
- /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
- pointer which is closer to the next 16B boundary so that only
- one CHECK_N_BYTES is needed before entering the loop below. */
- mr r8,r4
- mr r4,r3
- mr r3,r8
- mr r12,r7
- mr r7,r5
- mr r5,r12
- crset 4*cr1+eq /* Set bit on swapping source pointers. */
+ andi. r7,r3,4095
+ andi. r8,r4,4095
+ cmpldi cr0,r7,4096-16
+ cmpldi cr1,r8,4096-16
+ bgt cr0,L(crosses)
+ bgt cr1,L(crosses)
+ COMPARE_16(v4,v5,0)
- .p2align 5
+L(crosses):
+ andi. r7,r3,15
+ subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */
+ andi. r9,r4,15
+ subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */
+ cmpld cr7,r7,r5
+ beq cr7,L(same_aligned)
+ blt cr7,L(nalign1_min)
+
+ /* nalign2 is the minimum and the s2 pointer is aligned. */
+ CHECK_N_BYTES(r3,r4,r5)
+ /* Are we on the 64B hunk which crosses a page? */
+ andi. r10,r3,63 /* Determine offset into 64B hunk. */
+ andi. r8,r3,15 /* The offset into the 16B hunk. */
+ neg r7,r3
+ andi. r9,r7,15 /* Bytes to the next 16B boundary. */
+ rlwinm. r7,r7,26,0x3F /* ((4096 - r3) >> 6) & 63. */
+ beq L(compare_64_pagecross)
+ mtctr r7
+ b L(compare_64B_unaligned)
+
+ /* nalign1 is the minimum and the s1 pointer is aligned. */
L(nalign1_min):
CHECK_N_BYTES(r3,r4,r7)
+ /* Are we on the 64B hunk which crosses a page? */
+ andi. r10,r4,63 /* Determine offset into 64B hunk. */
+ andi. r8,r4,15 /* The offset into the 16B hunk. */
+ neg r7,r4
+ andi. r9,r7,15 /* Bytes to the next 16B boundary. */
+ rlwinm. r7,r7,26,0x3F /* ((4096 - r4) >> 6) & 63. */
+ beq L(compare_64_pagecross)
+ mtctr r7
.p2align 5
-L(s1_aligned):
- /* r9 and r5 is number of bytes to be read after and before
- page boundary correspondingly. */
- sub r5,r5,r7
- subfic r9,r5,16
- /* Now let r7 hold the count of quadwords which can be
- checked without crossing a page boundary. quadword offset is
- (str2>>4)&0xFF. */
- rlwinm r7,r4,28,0xFF
- /* Below check is required only for first iteration. For second
- iteration and beyond, the new loop counter is always 255. */
- cmpldi r7,255
- beq L(L3)
- /* Get the initial loop count by 255-((str2>>4)&0xFF). */
- subfic r11,r7,255
+L(compare_64B_unaligned):
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ COMPARE_16(v4,v5,32)
+ COMPARE_16(v4,v5,48)
+ addi r3,r3,64
+ addi r4,r4,64
+ bdnz L(compare_64B_unaligned)
- .p2align 5
-L(L1):
+ /* Cross the page boundary of the unaligned string, carefully. Only
+ on the first iteration do we need the count of 64B blocks to be
+ checked; from the second iteration onward, the loop counter is always 63. */
+L(compare_64_pagecross):
+ li r11,63
mtctr r11
-
- .p2align 5
-L(L2):
- COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
+ cmpldi r10,16
+ ble L(cross_4)
+ cmpldi r10,32
+ ble L(cross_3)
+ cmpldi r10,48
+ ble L(cross_2)
+L(cross_1):
+ CHECK_N_BYTES(r3,r4,r9)
+ CHECK_N_BYTES(r3,r4,r8)
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ COMPARE_16(v4,v5,32)
+ addi r3,r3,48
+ addi r4,r4,48
+ b L(compare_64B_unaligned)
+L(cross_2):
+ COMPARE_16(v4,v5,0)
addi r3,r3,16
addi r4,r4,16
- bdnz L(L2)
- /* Cross the page boundary of s2, carefully. */
-
- .p2align 5
-L(L3):
- CHECK_N_BYTES(r3,r4,r5)
CHECK_N_BYTES(r3,r4,r9)
- li r11,255 /* Load the new loop counter. */
- b L(L1)
+ CHECK_N_BYTES(r3,r4,r8)
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ addi r3,r3,32
+ addi r4,r4,32
+ b L(compare_64B_unaligned)
+L(cross_3):
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ addi r3,r3,32
+ addi r4,r4,32
+ CHECK_N_BYTES(r3,r4,r9)
+ CHECK_N_BYTES(r3,r4,r8)
+ COMPARE_16(v4,v5,0)
+ addi r3,r3,16
+ addi r4,r4,16
+ b L(compare_64B_unaligned)
+L(cross_4):
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ COMPARE_16(v4,v5,32)
+ addi r3,r3,48
+ addi r4,r4,48
+ CHECK_N_BYTES(r3,r4,r9)
+ CHECK_N_BYTES(r3,r4,r8)
+ b L(compare_64B_unaligned)
- .p2align 5
L(same_aligned):
CHECK_N_BYTES(r3,r4,r7)
/* Align s1 to 32B and adjust s2 address.
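
The rewritten body trades the old 255-iteration quadword loop for
64-byte hunks. The neg/rlwinm. pair counts the full 64B hunks left
before the next page boundary; when the count is zero,
L(compare_64_pagecross) dispatches on the offset within the last hunk
(r10) to one of L(cross_1)..L(cross_4). Each path compares the full
16B chunks it safely can and steps over the boundary in two
CHECK_N_BYTES pieces (r9 bytes up to the 16B boundary, r8 bytes past
it), consuming exactly 64 bytes so the counter stays in step. A C
sketch of the count, under the same 4 KiB page assumption (helper
name ours):

    #include <stdint.h>

    /* Full 64-byte hunks between P and its next 4 KiB page boundary;
       "rlwinm. r7,r7,26,0x3F" on the negated pointer is a right
       shift by 6 keeping 6 bits.  Zero means P already sits in the
       last 64B hunk of its page.  */
    static inline unsigned
    full_hunks_to_page_end (uintptr_t p)
    {
      return (unsigned) ((0 - p) >> 6) & 63;
    }
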
@@ -168,18 +208,7 @@ L(16B_aligned_loop):
/* Calculate and return the difference. */
L(different):
- vctzlsbb r6,v7
- vextubrx r5,r6,v4
- vextubrx r4,r6,v5
- bt 4*cr1+eq,L(swapped)
- subf r3,r4,r5
- blr
-
- /* If src pointers were swapped, then swap the
- indices and calculate the return value. */
-L(swapped):
- subf r3,r5,r4
- blr
+ TAIL(v4,v5)
.p2align 5
L(32B_aligned_loop):
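
With the entry-path swap of s1 and s2 gone, the epilogue no longer
needs the cr1 swap flag or the L(swapped) twin of L(different), and
the TAIL macro can form the result directly. In C terms, with idx
standing for the vctzlsbb output (a sketch, not glibc's macro):

    /* Usual strcmp convention: difference of the first differing
       bytes, compared as unsigned char.  */
    static int
    strcmp_tail (const unsigned char *s1, const unsigned char *s2,
                 int idx)
    {
      return (int) s1[idx] - (int) s2[idx];
    }
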