aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/powerpc/powerpc64/power8/strncpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/strncpy.S')
-rw-r--r-- sysdeps/powerpc/powerpc64/power8/strncpy.S 465
1 files changed, 0 insertions, 465 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
deleted file mode 100644
index 6d40f30ff7..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
+++ /dev/null
@@ -1,465 +0,0 @@
-/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifdef USE_AS_STPNCPY /* FUNC_NAME is the entry-point symbol defined below. */
-# ifndef STPNCPY /* A STPNCPY defined before this point overrides the name. */
-# define FUNC_NAME __stpncpy
-# else
-# define FUNC_NAME STPNCPY
-# endif
-#else
-# ifndef STRNCPY /* A STRNCPY defined before this point overrides the name. */
-# define FUNC_NAME strncpy
-# else
-# define FUNC_NAME STRNCPY
-# endif
-#endif /* !USE_AS_STPNCPY */
-
-#ifndef MEMSET
-/* For builds without IFUNC support, local calls should be made to the
- internal GLIBC symbol (created by libc_hidden_builtin_def). MEMSET is
- the routine called to zero-fill the tail of dest; a MEMSET defined
- before this point is used unchanged. */
-# ifdef SHARED
-# define MEMSET __GI_memset
-# else
-# define MEMSET memset
-# endif
-#endif
-
-#define FRAMESIZE (FRAME_MIN_SIZE+48) /* Size of the frame created around
- the MEMSET call. Presumably the extra
- 48 bytes cover the six doublewords
- (r26-r31) stored at -48(r1)..-8(r1)
- on entry -- TODO confirm. */
-
-/* Implements the function
-
- char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
-
- or
-
- char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
-
- if USE_AS_STPNCPY is defined.
-
- The implementation uses unaligned doubleword accesses to avoid specialized
- code paths depending on data alignment. Although recent powerpc64 uses
- 64K pages by default, the page-cross handling assumes a minimum page size
- of 4K.
-
- For strncpy, r3 is returned as received (dest). When the source ends
- before n bytes were written, the rest of dest is zero-filled through a
- call to MEMSET. */
-
- .machine power7
-EALIGN (FUNC_NAME, 4, 0)
-
- /* Check if [src]+16 will cross a 4K page by checking if the bit
- indicating the page size changes. Basically:
-
- uint64_t srcin = (uint64_t)src;
- uint64_t ob = srcin & 4096UL;
- uint64_t nb = (srcin+16UL) & 4096UL;
- if (ob ^ nb)
- goto pagecross;
-
- The page-cross path is only taken when n is also larger than
- 8 - (src & 7); otherwise the byte-wise short path is used. The
- stores of r26-r31 are interleaved with this test. r10 keeps src+16
- and is read again at L(align_to_16b). */
-
- addi r10,r4,16 /* r10 = src + 16. */
- rlwinm r9,r4,0,19,19 /* r9 = src & 4096. */
-
- /* Save some non-volatile registers on the stack. No frame is created
- here: the values are stored at negative offsets from r1. */
- std r26,-48(r1)
- std r27,-40(r1)
-
- rlwinm r8,r10,0,19,19 /* r8 = (src + 16) & 4096. */
-
- std r28,-32(r1)
- std r29,-24(r1)
-
- cmpld cr7,r9,r8
-
- std r30,-16(r1)
- std r31,-8(r1)
-
- /* Update CFI. */
- cfi_offset(r26, -48)
- cfi_offset(r27, -40)
- cfi_offset(r28, -32)
- cfi_offset(r29, -24)
- cfi_offset(r30, -16)
- cfi_offset(r31, -8)
-
- beq cr7,L(unaligned_lt_16) /* Same 4K bit: no page cross. */
- rldicl r9,r4,0,61 /* r9 = src & 7. */
- subfic r8,r9,8 /* r8 = 8 - (src & 7). */
- cmpld cr7,r5,r8
- bgt cr7,L(pagecross) /* n > r8 (unsigned); otherwise fall through
- to L(short_path). */
-
- /* At this point there are 1 to 15 bytes to check and write. Since they
- could come either from the first unaligned 16-byte access or from the
- bulk copy, the code uses an unrolled byte read/write instead of trying
- to analyze the cmpb results.
-
- Register roles on this path: r4 = src cursor, r9 = dest cursor,
- r5 = bytes still to be written. L(short_path) starts from dest itself;
- L(short_path_1) and L(short_path_2) expect r9 to be set already. */
-L(short_path):
- mr r9,r3
-L(short_path_1):
- /* Return if there are no more bytes to be written. */
- cmpdi cr7,r5,0
- beq cr7,L(short_path_loop_end_1)
-L(short_path_2):
- /* Copy one char from src (r4) and write it to dest (r9). If it is the
- end-of-string, start the null padding. Continue, otherwise. */
- lbz r10,0(r4)
- cmpdi cr7,r10,0
- stb r10,0(r9)
- beq cr7,L(zero_pad_start_1)
- /* If there are no more bytes to be written, return. r8 and r6 are
- the dest pointer and count after this first byte; they are consumed
- by L(zero_pad_start_prepare_1). */
- cmpdi cr0,r5,1
- addi r8,r9,1
- addi r6,r5,-1
- beq cr0,L(short_path_loop_end_0)
- /* Copy another char from src (r4) to dest (r9). Check again if it is
- the end-of-string. If so, start the null padding. */
- lbz r10,1(r4)
- cmpdi cr7,r10,0
- stb r10,1(r9)
- beq cr7,L(zero_pad_start_prepare_1)
- /* Eagerly compute r10 = r5 - 3, which accounts for the two bytes
- already written plus one write that will be performed later on.
- r5 itself is only updated at L(short_path_loop_1). */
- addi r10,r5,-3
- b L(short_path_loop_1)
-
- .align 4
-L(short_path_loop):
- /* At this point, the induction variable, r5, as well as the pointers
- to dest and src (r9 and r4, respectively) have been updated.
-
- Note: The registers r7 and r10 are induction variables derived from
- r5. They are used to determine if the total number of writes has
- been reached at every other write. cr7 was set from r10 at the
- bottom of the previous iteration.
-
- Copy one char from src (r4) and write it to dest (r9). If it is the
- end-of-string, start the null padding. Continue, otherwise. */
- lbz r8,0(r4)
- addi r7,r10,-2
- cmpdi cr5,r8,0
- stb r8,0(r9)
- beq cr5,L(zero_pad_start_1)
- beq cr7,L(short_path_loop_end_0)
- /* Copy another char from src (r4) to dest (r9). Check again if it is
- the end-of-string. If so, start the null padding. */
- lbz r8,1(r4)
- cmpdi cr7,r8,0
- stb r8,1(r9)
- beq cr7,L(zero_pad_start)
- mr r10,r7
-L(short_path_loop_1):
- /* This block is reached after two chars have been already written to
- dest. Nevertheless, r5 (the induction variable), r9 (the pointer to
- dest), and r4 (the pointer to src) have not yet been updated.
-
- At this point:
- r5 holds the count of bytes yet to be written plus 2.
- r9 points to the last two chars that were already written to dest.
- r4 points to the last two chars that were already copied from src.
-
- The algorithm continues by decrementing r5, the induction variable,
- so that it reflects the last two writes. The pointers to dest (r9)
- and to src (r4) are incremented by two, for the same reason.
-
- Note: Register r10 is another induction variable, derived from r5,
- which determines if the total number of writes has been reached.
- r6 is set to r9+1 for use by L(zero_pad_start). */
- addic. r5,r5,-2
- addi r9,r9,2
- cmpdi cr7,r10,0 /* Eagerly check if the next write is the last. */
- addi r4,r4,2
- addi r6,r9,1
- bne cr0,L(short_path_loop) /* Check if the total number of writes
- has been reached at every other
- write. */
-#ifdef USE_AS_STPNCPY
- mr r3,r9
- b L(short_path_loop_end)
-#endif
-
-L(short_path_loop_end_0):
-#ifdef USE_AS_STPNCPY
- addi r3,r9,1 /* Return the address after the byte stored at 0(r9). */
- b L(short_path_loop_end)
-#endif
-L(short_path_loop_end_1):
-#ifdef USE_AS_STPNCPY
- mr r3,r9
-#endif
-L(short_path_loop_end):
- /* Restore non-volatile registers. No frame was created on this path,
- so the offsets match the ones used on entry. */
- ld r26,-48(r1)
- ld r27,-40(r1)
- ld r28,-32(r1)
- ld r29,-24(r1)
- ld r30,-16(r1)
- ld r31,-8(r1)
- blr
-
- /* This code pads the remainder of dest with null bytes. The remaining
- size is already available in a register, so the code only has to
- move it into place and call memset. */
- .align 4
-L(zero_pad_start):
- mr r5,r10 /* r10 = bytes left counting from the null at 1(r9). */
- mr r9,r6 /* r6 = r9 + 1, the address of that null byte. */
-L(zero_pad_start_1):
- /* At this point:
- - r5 holds the number of bytes that still have to be written to
- dest.
- - r9 points to the position, in dest, where the first null byte
- will be written.
- The above statements are true both when control reaches this label
- from a branch or when falling through the previous lines. */
-#ifndef USE_AS_STPNCPY
- mr r30,r3 /* Save the return value of strncpy. */
-#endif
- /* Prepare the call to memset. r5 already holds the length. */
- mr r3,r9 /* Pointer to the area to be zero-filled. */
- li r4,0 /* Byte to be written (zero). */
-
- /* We delayed the creation of the stack frame, as well as the saving of
- the link register, because only at this point, we are sure that
- doing so is actually needed. */
-
- /* Save the link register. */
- mflr r0
- std r0,16(r1)
- cfi_offset(lr, 16)
-
- /* Create the stack frame. */
- stdu r1,-FRAMESIZE(r1)
- cfi_adjust_cfa_offset(FRAMESIZE)
-
- bl MEMSET
- nop
-
- /* Restore the stack frame. */
- addi r1,r1,FRAMESIZE
- cfi_adjust_cfa_offset(-FRAMESIZE)
- /* Restore the link register. */
- ld r0,16(r1)
- mtlr r0
-
-#ifndef USE_AS_STPNCPY
- mr r3,r30 /* Restore the return value of strncpy, i.e.:
- dest. For stpncpy, the return value is the
- same as return value of memset. */
-#endif
-
- /* Restore non-volatile registers and return. r1 is back to its entry
- value, so the offsets match the ones used on entry. */
- ld r26,-48(r1)
- ld r27,-40(r1)
- ld r28,-32(r1)
- ld r29,-24(r1)
- ld r30,-16(r1)
- ld r31,-8(r1)
- blr
-
- /* The common case where [src]+16 will not cross a 4K page boundary.
- In this case the code quickly checks the first 16 bytes using
- doubleword reads/compares and stores each doubleword to the
- destination only if it contains no null byte and the total size has
- not been reached. Otherwise the byte-wise short path takes over. */
- .align 4
-L(unaligned_lt_16):
- cmpldi cr7,r5,7
- ble cr7,L(short_path) /* n <= 7: not enough for a doubleword. */
- ld r7,0(r4)
- li r8,0
- cmpb r8,r7,r8 /* Non-zero if the doubleword has a null byte. */
- cmpdi cr7,r8,0
- bne cr7,L(short_path_prepare_2)
- addi r6,r5,-8 /* r6 = n - 8. */
- std r7,0(r3)
- addi r9,r3,8 /* r9 = dest + 8. */
- cmpldi cr7,r6,7
- addi r7,r4,8 /* r7 = src + 8. */
- ble cr7,L(short_path_prepare_1_1)
- ld r4,8(r4) /* r4 now holds data, not the src pointer. */
- cmpb r8,r4,r8 /* r8 is zero here (no null found so far). */
- cmpdi cr7,r8,0
- bne cr7,L(short_path_prepare_2_1)
- std r4,8(r3)
- addi r29,r3,16 /* r29 = dest + 16. */
- addi r5,r5,-16 /* r5 = n - 16. */
- /* Neither a null byte was found nor the total length was reached:
- align to 16 bytes and issue a bulk copy/compare. */
- b L(align_to_16b)
-
- /* In the case of a 4K page boundary cross, the algorithm first aligns
- the address down, calculates a mask based on the alignment to ignore
- the bytes that are not part of the string, and continues using
- doublewords.
-
- On entry r9 = src & 7 and r8 = 8 - (src & 7), as computed before the
- branch to this label. Every early exit goes to
- L(short_path_prepare_2), which restarts the byte-wise copy from the
- unmodified dest (r3), src (r4) and n (r5). */
- .align 4
-L(pagecross):
- rldicr r11,r4,0,59 /* Align the address down. NOTE(review): a mask
- end of 59 clears the low four bits, i.e. a
- 16-byte boundary, while the padding below is
- derived from src & 7 -- confirm the intended
- alignment. */
- li r6,-1 /* MASK = 0xffffffffffffffffUL. */
- sldi r9,r9,3 /* Calculate padding (in bits). */
- ld r7,0(r11) /* Load doubleword from memory. */
-#ifdef __LITTLE_ENDIAN__
- sld r9,r6,r9 /* MASK = MASK << padding. */
-#else
- srd r9,r6,r9 /* MASK = MASK >> padding. */
-#endif
- orc r9,r7,r9 /* Mask bits that are not part of the
- string. */
- li r7,0
- cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
- cmpdi cr7,r9,0
- bne cr7,L(short_path_prepare_2)
- subf r8,r8,r5 /* Adjust total length: r8 = n - r8. */
- cmpldi cr7,r8,8 /* Check if length was reached. */
- ble cr7,L(short_path_prepare_2)
-
- /* For the next checks we have an aligned address, so we check three
- more doublewords to make sure we can read 16 unaligned bytes
- to start the bulk copy with 16-byte aligned addresses. r9 is zero
- before each cmpb below, because the previous check found no null
- byte. */
- ld r7,8(r11)
- cmpb r9,r7,r9
- cmpdi cr7,r9,0
- bne cr7,L(short_path_prepare_2)
- addi r7,r8,-8
- cmpldi cr7,r7,8
- ble cr7,L(short_path_prepare_2)
- ld r7,16(r11)
- cmpb r9,r7,r9
- cmpdi cr7,r9,0
- bne cr7,L(short_path_prepare_2)
- addi r8,r8,-16
- cmpldi cr7,r8,8
- ble cr7,L(short_path_prepare_2)
- ld r8,24(r11)
- cmpb r9,r8,r9
- cmpdi cr7,r9,0
- bne cr7,L(short_path_prepare_2)
-
- /* No null byte found in the 32 bytes read and length not reached:
- read the source again using unaligned loads and store them. This
- copies the first 16 bytes and leaves r29 = dest + 16, r5 = n - 16. */
- ld r9,0(r4)
- addi r29,r3,16
- addi r5,r5,-16
- std r9,0(r3)
- ld r9,8(r4)
- std r9,8(r3)
-
- /* Align source to 16 bytes and adjust destination and size. r10 still
- holds src+16 from the function entry; r29 is dest+16 and r5 is n-16.
- Rounding the source down to a 16-byte boundary (r28) moves the
- destination back (r29) and grows the count (r12) by the same amount
- (r9), so some bytes that were already copied are copied again. */
-L(align_to_16b):
- rldicl r9,r10,0,60 /* r9 = (src + 16) & 15. */
- rldicr r28,r10,0,59 /* r28 = (src + 16) with the low 4 bits cleared. */
- add r12,r5,r9
- subf r29,r9,r29 /* r29 = r29 - r9. */
-
- /* The bulk read/compare/copy loads two doublewords, compares each one
- against zero and merges the results in a single register for speed.
- This is an attempt to speed up the null-checking process for bigger
- strings. */
-
- cmpldi cr7,r12,15
- ble cr7,L(short_path_prepare_1_2)
-
- /* Main loop for large sizes, unrolled 2 times to get better use of
- pipeline. This first 16-byte step runs once, before the loop.
- NOTE(review): the base register of the next two loads is written as
- a bare 28 (i.e. r28), unlike the rN spelling used elsewhere. */
- ld r8,0(28)
- ld r10,8(28)
- li r9,0
- cmpb r7,r8,r9
- cmpb r9,r10,r9
- or. r6,r9,r7
- bne cr0,L(short_path_prepare_2_3)
- addi r5,r12,-16
- addi r4,r28,16
- std r8,0(r29)
- std r10,8(r29)
- cmpldi cr7,r5,15
- addi r9,r29,16
- ble cr7,L(short_path_1)
- mr r11,r28
- mr r6,r29
- li r30,0 /* Zero used by the cmpb checks in L(loop_16b). */
- subfic r26,r4,48 /* r26 = 48 - r4. */
- subfic r27,r9,48 /* r27 = 48 - r9. */
-
- b L(loop_16b)
-
- /* Loop invariant on entry to L(loop_16b): r11/r6/r12 are the
- src/dest/count at the start of the current 32-byte group, while
- r4/r9/r5 describe the same position 16 bytes further on. Each pass
- through L(loop_start) moves r4 and r9 forward by 32 by means of the
- r26/r27 and r28/r29 additions. */
- .align 4
-L(loop_start):
- ld r31,0(r11)
- ld r10,8(r11)
- cmpb r0,r31,r7 /* r7 is zero here (no null found so far). */
- cmpb r8,r10,r7
- or. r7,r0,r8
- addi r5,r5,-32
- cmpldi cr7,r5,15
- add r4,r4,r26
- add r9,r9,r27
- bne cr0,L(short_path_prepare_2_2)
- add r4,r28,r4
- std r31,0(r6)
- add r9,r29,r9
- std r10,8(r6)
- ble cr7,L(short_path_1)
-
-L(loop_16b):
- ld r10,16(r11)
- ld r0,24(r11)
- cmpb r8,r10,r30
- cmpb r7,r0,r30
- or. r7,r8,r7
- addi r12,r12,-32
- cmpldi cr7,r12,15
- addi r11,r11,32
- bne cr0,L(short_path_2) /* Null byte found: r4/r9/r5 already
- describe these 16 bytes. */
- std r10,16(r6)
- addi r6,r6,32
- std r0,-8(r6)
- bgt cr7,L(loop_start)
-
- /* 15 bytes or fewer remain: finish on the byte-wise path. */
- mr r5,r12
- mr r4,r11
- mr r9,r6
- b L(short_path_1)
-
- .align 4 /* The stubs below load r5 (count), r4 (src) and r9 (dest)
- as needed and then enter the byte-wise short path or the
- zero padding. */
-L(short_path_prepare_1_1): /* r9 was already set by the caller. */
- mr r5,r6
- mr r4,r7
- b L(short_path_1)
-L(short_path_prepare_1_2):
- mr r5,r12
- mr r4,r28
- mr r9,r29
- b L(short_path_1)
-L(short_path_prepare_2): /* Restart from dest; r4 and r5 are used as is. */
- mr r9,r3
- b L(short_path_2)
-L(short_path_prepare_2_1): /* r9 was already set by the caller. */
- mr r5,r6
- mr r4,r7
- b L(short_path_2)
-L(short_path_prepare_2_2):
- mr r5,r12
- mr r4,r11
- mr r9,r6
- b L(short_path_2)
-L(short_path_prepare_2_3):
- mr r5,r12
- mr r4,r28
- mr r9,r29
- b L(short_path_2)
-L(zero_pad_start_prepare_1): /* r6 = count and r8 = dest after the first
- byte written at L(short_path_2). */
- mr r5,r6
- mr r9,r8
- b L(zero_pad_start_1)
-END (FUNC_NAME)
-
-#ifndef USE_AS_STPNCPY
-libc_hidden_builtin_def (strncpy)
-#endif