path: root/sysdeps/powerpc/powerpc64/power8/strlen.S
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/strlen.S')
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strlen.S  301
1 file changed, 0 insertions, 301 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
deleted file mode 100644
index 8f4a1fc1dc..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strlen.S
+++ /dev/null
@@ -1,301 +0,0 @@
-/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
- loop.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* TODO: change these to the actual instructions when the minimum required
- binutils allows it. */
-#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
-#define VBPERMQ(t,a,b) .long (0x1000054c \
- | ((t)<<(32-11)) \
- | ((a)<<(32-16)) \
- | ((b)<<(32-21)) )
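For example, expanding the first of these macros by hand for the use near the end of this file, MFVRD(r10,v4) becomes .long (0x7c000067 | (4 << 21) | (10 << 16)), i.e. .long 0x7c8a0067, the instruction word that `mfvrd r10,v4` assembles to once binutils is new enough to accept the mnemonic directly.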
-
-/* int [r3] strlen (char *s [r3]) */
-
-#ifndef STRLEN
-# define STRLEN strlen
-#endif
-
-/* TODO: change this to .machine power8 when the minimum required binutils
- allows it. */
- .machine power7
-EALIGN (STRLEN, 4, 0)
- CALL_MCOUNT 1
- dcbt 0,r3
- clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
- li r5,-1 /* MASK = 0xffffffffffffffff. */
- ld r12,0(r4) /* Load doubleword from memory. */
-#ifdef __LITTLE_ENDIAN__
- sld r5,r5,r6
-#else
- srd r5,r5,r6 /* MASK = MASK >> padding. */
-#endif
- orc r9,r12,r5 /* Mask bits that are not part of the string. */
- cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
-	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
- bne cr7,L(done)
-
- /* For shorter strings (< 64 bytes), we will not use vector registers,
- as the overhead isn't worth it. So, let's use GPRs instead. This
- will be done the same way as we do in the POWER7 implementation.
- Let's see if we are aligned to a quadword boundary. If so, we can
- jump to the first (non-vectorized) loop. Otherwise, we have to
- handle the next DWORD first. */
- mtcrf 0x01,r4
- mr r9,r4
- addi r9,r9,8
- bt 28,L(align64)
-
- /* Handle the next 8 bytes so we are aligned to a quadword
- boundary. */
- ldu r5,8(r4)
- cmpb r10,r5,r0
- cmpdi cr7,r10,0
- addi r9,r9,8
- bne cr7,L(done)
-
-L(align64):
- /* Proceed to the old (POWER7) implementation, checking two doublewords
-	   per iteration.  For the first 56 bytes, we will just check for null
- characters. After that, we will also check if we are 64-byte aligned
- so we can jump to the vectorized implementation. We will unroll
- these loops to avoid excessive branching. */
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- /* Are we 64-byte aligned? If so, jump to the vectorized loop.
-	   Note: aligning to a 64-byte boundary will necessarily slow down
-	   performance for strings around 64 bytes long due to the extra comparisons
- required to check alignment for the vectorized loop. This is a
- necessary tradeoff we are willing to take in order to speed up the
- calculation for larger strings. */
- andi. r10,r9,63
- beq cr0,L(preloop)
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- andi. r10,r9,63
- beq cr0,L(preloop)
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- andi. r10,r9,63
- beq cr0,L(preloop)
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
- bne cr7,L(dword_zero)
-
- andi. r10,r9,63
- beq cr0,L(preloop)
- ld r6,8(r4)
- ldu r5,16(r4)
- cmpb r10,r6,r0
- cmpb r11,r5,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- addi r9,r9,16
-
- /* At this point, we are necessarily 64-byte aligned. If no zeroes were
- found, jump to the vectorized loop. */
- beq cr7,L(preloop)
-
-L(dword_zero):
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
-
- cmpdi cr6,r10,0
- addi r4,r4,-8
- bne cr6,L(done)
-
- /* The null byte must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- length. */
-
- mr r10,r11
- addi r4,r4,8
-
- /* If the null byte was found in the non-vectorized code, compute the
- final length. r10 has the output of the cmpb instruction, that is,
- it contains 0xff in the same position as the null byte in the
- original doubleword from the string. Use that to calculate the
- length. */
-L(done):
-#ifdef __LITTLE_ENDIAN__
- addi r9, r10,-1 /* Form a mask from trailing zeros. */
- andc r9, r9,r10
- popcntd r0, r9 /* Count the bits in the mask. */
-#else
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- subf r5,r3,r4
- srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
- add r3,r5,r0 /* Compute final length. */
- blr
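
As an illustration only (not part of the original file), the doubleword path above can be modelled in C on a little-endian host. The helper names below are invented for this sketch, and the aligned 8-byte over-read mirrors what the assembly does rather than strictly conforming C:

    #include <stdint.h>
    #include <string.h>

    /* Model of cmpb against zero: 0xff in every result byte whose
       corresponding byte in DWORD is 0x00.  */
    static uint64_t
    cmpb_zero (uint64_t dword)
    {
      uint64_t r = 0;
      for (int i = 0; i < 8; i++)
        if (((dword >> (i * 8)) & 0xff) == 0)
          r |= 0xffULL << (i * 8);
      return r;
    }

    static size_t
    sketch_strlen (const char *s)
    {
      const char *aligned = (const char *) ((uintptr_t) s & ~(uintptr_t) 7);
      unsigned int pad = (uintptr_t) s & 7;  /* Bytes before s in the dword.  */
      uint64_t dword, hits, mask;

      memcpy (&dword, aligned, 8);           /* ld r12,0(r4) */
      mask = ~0ULL << (pad * 8);             /* sld r5,r5,r6 (LE branch) */
      hits = cmpb_zero (dword | ~mask);      /* orc + cmpb: padding bytes are
                                                forced to be non-null.  */
      while (hits == 0)
        {
          aligned += 8;
          memcpy (&dword, aligned, 8);
          hits = cmpb_zero (dword);
        }

      /* L(done), little-endian branch: form a mask of the trailing zero
         bits, count them with popcount, then divide by 8 to get the byte
         index of the first null within the doubleword.  */
      uint64_t tz = (hits - 1) & ~hits;
      unsigned int first = __builtin_popcountll (tz) / 8;
      return (size_t) (aligned + first - s);
    }

The real code additionally unrolls the doubleword loop and, once 64-byte alignment is reached, jumps to the vector loop below instead of staying in this scalar loop.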
-
- /* Vectorized implementation starts here. */
- .p2align 4
-L(preloop):
- /* Set up for the loop. */
- mr r4,r9
- li r7, 16 /* Load required offsets. */
- li r8, 32
- li r9, 48
- li r12, 8
- vxor v0,v0,v0 /* VR with null chars to use with
- vcmpequb. */
-
- /* Main loop to look for the end of the string. We will read in
- 64-byte chunks. Align it to 32 bytes and unroll it 3 times to
- leverage the icache performance. */
- .p2align 5
-L(loop):
- lvx v1,r4,r0 /* Load 4 quadwords. */
- lvx v2,r4,r7
- lvx v3,r4,r8
- lvx v4,r4,r9
- vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
- vminub v6,v3,v4
- vminub v7,v5,v6
- vcmpequb. v7,v7,v0 /* Check for NULLs. */
- addi r4,r4,64 /* Adjust address for the next iteration. */
- bne cr6,L(vmx_zero)
-
- lvx v1,r4,r0 /* Load 4 quadwords. */
- lvx v2,r4,r7
- lvx v3,r4,r8
- lvx v4,r4,r9
- vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
- vminub v6,v3,v4
- vminub v7,v5,v6
- vcmpequb. v7,v7,v0 /* Check for NULLs. */
- addi r4,r4,64 /* Adjust address for the next iteration. */
- bne cr6,L(vmx_zero)
-
- lvx v1,r4,r0 /* Load 4 quadwords. */
- lvx v2,r4,r7
- lvx v3,r4,r8
- lvx v4,r4,r9
- vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
- vminub v6,v3,v4
- vminub v7,v5,v6
- vcmpequb. v7,v7,v0 /* Check for NULLs. */
- addi r4,r4,64 /* Adjust address for the next iteration. */
- beq cr6,L(loop)
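
A note on the vminub merging above (illustrative, not from the original file): for unsigned bytes, min(a, b) == 0 exactly when a == 0 or b == 0, so the byte-wise minimum of the four quadwords has a zero lane if and only if one of them does, and a single vcmpequb. against zero covers all 64 bytes. A scalar model, with an invented helper name:

    /* Return nonzero when any of the 64 bytes starting at BLOCK is 0,
       mirroring the three vminub merges plus vcmpequb. above.  */
    static int
    block_has_null (const unsigned char *block)
    {
      for (int lane = 0; lane < 16; lane++)
        {
          unsigned char m = block[lane];                /* v1 lane */
          for (int q = 1; q < 4; q++)                   /* v2..v4 lanes */
            if (block[q * 16 + lane] < m)
              m = block[q * 16 + lane];                 /* vminub */
          if (m == 0)                                   /* vcmpequb. -> cr6 */
            return 1;
        }
      return 0;
    }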
-
-L(vmx_zero):
- /* OK, we found a null byte. Let's look for it in the current 64-byte
- block and mark it in its corresponding VR. */
- vcmpequb v1,v1,v0
- vcmpequb v2,v2,v0
- vcmpequb v3,v3,v0
- vcmpequb v4,v4,v0
-
- /* We will now 'compress' the result into a single doubleword, so it
- can be moved to a GPR for the final calculation. First, we
- generate an appropriate mask for vbpermq, so we can permute bits into
- the first halfword. */
- vspltisb v10,3
- lvsl v11,r0,r0
- vslb v10,v11,v10
-
- /* Permute the first bit of each byte into bits 48-63. */
- VBPERMQ(v1,v1,v10)
- VBPERMQ(v2,v2,v10)
- VBPERMQ(v3,v3,v10)
- VBPERMQ(v4,v4,v10)
-
- /* Shift each component into its correct position for merging. */
-#ifdef __LITTLE_ENDIAN__
- vsldoi v2,v2,v2,2
- vsldoi v3,v3,v3,4
- vsldoi v4,v4,v4,6
-#else
- vsldoi v1,v1,v1,6
- vsldoi v2,v2,v2,4
- vsldoi v3,v3,v3,2
-#endif
-
- /* Merge the results and move to a GPR. */
- vor v1,v2,v1
- vor v2,v3,v4
- vor v4,v1,v2
- MFVRD(r10,v4)
-
-	/* Adjust address to the beginning of the current 64-byte block.  */
- addi r4,r4,-64
-
-#ifdef __LITTLE_ENDIAN__
- addi r9, r10,-1 /* Form a mask from trailing zeros. */
- andc r9, r9,r10
- popcntd r0, r9 /* Count the bits in the mask. */
-#else
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- subf r5,r3,r4
- add r3,r5,r0 /* Compute final length. */
- blr
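
Again purely for illustration: on little endian, the vbpermq/vsldoi/vor sequence leaves a value in r10 with one bit per byte of the 64-byte block, set where the byte is NUL, and the count of trailing zero bits is then the offset of the first null. A C model of that final step, with invented names:

    #include <stdint.h>

    /* One bit per byte of the 64-byte block, set for NUL bytes; stands in
       for the four vcmpequb + VBPERMQ results merged with vor and moved
       to a GPR with MFVRD.  */
    static uint64_t
    null_mask64 (const unsigned char *block)
    {
      uint64_t mask = 0;
      for (int i = 0; i < 64; i++)
        if (block[i] == 0)
          mask |= 1ULL << i;
      return mask;
    }

    /* Offset of the first NUL in a block known to contain one, using the
       same trailing-zero trick as the scalar path, but without the divide
       by 8 since each bit already maps to one byte.  */
    static unsigned int
    first_null_offset (const unsigned char *block)
    {
      uint64_t m = null_mask64 (block);
      uint64_t tz = (m - 1) & ~m;        /* Mask of the trailing zero bits.  */
      return __builtin_popcountll (tz);  /* Byte index of the first NUL.  */
    }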
-
-END (STRLEN)
-libc_hidden_builtin_def (strlen)