Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/memset.S')
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/memset.S  458
1 file changed, 0 insertions, 458 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
deleted file mode 100644
index bc734c9f4f..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/memset.S
+++ /dev/null
@@ -1,458 +0,0 @@
-/* Optimized memset implementation for PowerPC64/POWER8.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
-
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
- Returns 's'. */
-
-#ifndef MEMSET
-# define MEMSET memset
-#endif
-
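For reference, the register mapping above corresponds to this minimal C
sketch of the contract (hypothetical name memset_ref, not part of the
file): fill n bytes at s with the low byte of c and return s, which is
why the code below immediately copies r3 to r10 and works on the copy.

    #include <stddef.h>

    /* Fill n bytes at s with (unsigned char) c and return the original
       pointer; the assembly keeps r3 intact by operating on r10.  */
    static void *memset_ref (void *s, int c, size_t n)
    {
      unsigned char *p = s;
      while (n--)
        *p++ = (unsigned char) c;
      return s;
    }
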
- /* No need to use .machine power8 since mtvsrd is already
-    handled by the define.  It avoids breakage on binutils
-    versions that do not support this machine specifier. */
- .machine power7
-EALIGN (MEMSET, 5, 0)
- CALL_MCOUNT 3
-
-L(_memset):
- cmpldi cr7,r5,31
- neg r0,r3
- mr r10,r3
-
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32 /* Replicate byte to word. */
- ble cr7,L(write_LT_32)
-
- andi. r11,r10,15 /* Check alignment of DST. */
- insrdi r4,r4,32,0 /* Replicate word to double word. */
-
- beq L(big_aligned)
-
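The three insrdi instructions above splat the fill byte stepwise: byte to
halfword, halfword to word, word to doubleword. A C sketch of the same
replication (hypothetical helper splat_byte):

    #include <stdint.h>

    /* Replicate the fill byte across 64 bits, mirroring the
       insrdi byte -> halfword -> word -> doubleword sequence.  */
    static uint64_t splat_byte (int c)
    {
      uint64_t v = (uint8_t) c;
      v |= v << 8;    /* byte     -> halfword   */
      v |= v << 16;   /* halfword -> word       */
      v |= v << 32;   /* word     -> doubleword */
      return v;
    }
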
- mtocrf 0x01,r0
- clrldi r0,r0,60
-
- /* Get DST aligned to 16 bytes. */
-1: bf 31,2f
- stb r4,0(r10)
- addi r10,r10,1
-
-2: bf 30,4f
- sth r4,0(r10)
- addi r10,r10,2
-
-4: bf 29,8f
- stw r4,0(r10)
- addi r10,r10,4
-
-8: bf 28,16f
- std r4,0(r10)
- addi r10,r10,8
-
-16: subf r5,r0,r5
-
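mtocrf moves the low nibble of r0 = -DST into a CR field, and each bf
consumes one bit: write 1 byte, then 2, then 4, then 8, leaving DST
16-byte aligned; the subf above then deducts the pad from the count. A C
sketch of this prologue (hypothetical helper align_head_16; assumes
unaligned 2/4/8-byte stores are acceptable, as they are on POWER):

    #include <stdint.h>
    #include <string.h>

    /* Advance *pp to the next 16-byte boundary while storing the
       doubleword pattern v, like the bf 31/30/29/28 chain above;
       returns the number of pad bytes written.  */
    static size_t align_head_16 (unsigned char **pp, uint64_t v)
    {
      unsigned char *p = *pp;
      size_t pad = (size_t) (-(uintptr_t) p) & 15;
      if (pad & 1) { *p = (unsigned char) v; p += 1; }
      if (pad & 2) { uint16_t h = (uint16_t) v; memcpy (p, &h, 2); p += 2; }
      if (pad & 4) { uint32_t w = (uint32_t) v; memcpy (p, &w, 4); p += 4; }
      if (pad & 8) { memcpy (p, &v, 8); p += 8; }
      *pp = p;
      return pad;
    }
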
- .align 4
-L(big_aligned):
- /* For sizes larger than 255 there are two possible paths:
-    - if the constant is '0', zero full cache lines with dcbz
-    - otherwise use vector instructions. */
- cmpldi cr5,r5,255
- dcbtst 0,r10
- cmpldi cr6,r4,0
- crand 27,26,21
- bt 27,L(huge_dcbz)
- bge cr5,L(huge_vector)
-
-
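cr5 holds the unsigned compare of the length with 255 and cr6 the compare
of the pattern with 0; crand 27,26,21 ANDs cr5's "greater than" bit with
cr6's "equal" bit so a single condition bit selects the dcbz path, while
bge cr5 (not less than, i.e. n >= 255) selects the vector path. As a C
dispatch (the three functions stand for the labels below and are
hypothetical):

    #include <stddef.h>
    #include <stdint.h>

    void huge_dcbz (unsigned char *p, size_t n);
    void huge_vector (unsigned char *p, size_t n, uint64_t v);
    void doubleword_loop (unsigned char *p, size_t n, uint64_t v);

    /* Path selection performed by the crand/bt/bge sequence above.  */
    static void dispatch (unsigned char *p, size_t n, uint64_t v)
    {
      if (n > 255 && v == 0)
        huge_dcbz (p, n);           /* zero whole cache lines */
      else if (n >= 255)
        huge_vector (p, n, v);      /* 16-byte vector stores  */
      else
        doubleword_loop (p, n, v);  /* 32..255 bytes: std     */
    }
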
- /* Size between 32 and 255 bytes with a constant other than 0: use
-    doubleword store instructions to achieve best throughput. */
- srdi r8,r5,5
- clrldi r11,r5,59
- cmpldi cr6,r11,0
- cmpdi r8,0
- beq L(tail_bytes)
- mtctr r8
-
- /* Main aligned write loop, writes 32-bytes at a time. */
- .align 4
-L(big_loop):
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- addi r10,r10,32
- bdz L(tail_bytes)
-
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- addi r10,r10,32
- bdnz L(big_loop)
-
- b L(tail_bytes)
-
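L(big_loop) writes four doublewords per pass and is unrolled twice
against the CTR, so a full turn covers 64 bytes with a mid-point exit
via bdz. A simplified C sketch of the 32-bytes-per-iteration core
(hypothetical store32_loop; the n & 31 remainder is left to the tail):

    #include <stdint.h>
    #include <string.h>

    /* Store the 64-bit pattern v in 32-byte chunks, n >> 5 times.  */
    static unsigned char *store32_loop (unsigned char *p, size_t n, uint64_t v)
    {
      for (size_t i = n >> 5; i > 0; i--)
        {
          memcpy (p +  0, &v, 8);
          memcpy (p +  8, &v, 8);
          memcpy (p + 16, &v, 8);
          memcpy (p + 24, &v, 8);
          p += 32;
        }
      return p;
    }
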
- /* Write remaining 1~31 bytes. */
- .align 4
-L(tail_bytes):
- beqlr cr6
-
- srdi r7,r11,4
- clrldi r8,r11,60
- mtocrf 0x01,r7
-
- .align 4
- bf 31,8f
- std r4,0(r10)
- std r4,8(r10)
- addi r10,r10,16
-
- .align 4
8: mtocrf 0x01,r8
- bf 28,4f
- std r4,0(r10)
- addi r10,r10,8
-
- .align 4
-4: bf 29,2f
- stw r4,0(r10)
- addi r10,r10,4
-
- .align 4
-2: bf 30,1f
- sth r4,0(r10)
- addi r10,r10,2
-
- .align 4
-1: bflr 31
- stb r4,0(r10)
- blr
-
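The tail splits the remainder r11 = n & 31 across CR bits so that at
most one store of each power-of-two size is issued: 16, 8, 4, 2, then
1 byte. The same remainder tree as a C sketch (hypothetical
store_tail31):

    #include <stdint.h>
    #include <string.h>

    /* Write the final rem = n & 31 bytes of pattern v, largest
       power-of-two store first, as in L(tail_bytes).  */
    static void store_tail31 (unsigned char *p, size_t rem, uint64_t v)
    {
      if (rem & 16) { memcpy (p, &v, 8); memcpy (p + 8, &v, 8); p += 16; }
      if (rem & 8)  { memcpy (p, &v, 8); p += 8; }
      if (rem & 4)  { uint32_t w = (uint32_t) v; memcpy (p, &w, 4); p += 4; }
      if (rem & 2)  { uint16_t h = (uint16_t) v; memcpy (p, &h, 2); p += 2; }
      if (rem & 1)  *p = (unsigned char) v;
    }
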
- /* Size larger than 255 bytes with a constant other than 0: use
-    vector instructions to achieve best throughput. */
-L(huge_vector):
- /* Replicate set byte to quadword in VMX register. */
- MTVSRD_V1_R4
- xxpermdi 32,v0,v1,0
- vspltb v2,v0,15
-
- /* Main aligned write loop: 128 bytes at a time. */
- li r6,16
- li r7,32
- li r8,48
- mtocrf 0x02,r5
- srdi r12,r5,7
- cmpdi r12,0
- beq L(aligned_tail)
- mtctr r12
- b L(aligned_128loop)
-
- .align 4
-L(aligned_128loop):
- stvx v2,0,r10
- stvx v2,r10,r6
- stvx v2,r10,r7
- stvx v2,r10,r8
- addi r10,r10,64
- stvx v2,0,r10
- stvx v2,r10,r6
- stvx v2,r10,r7
- stvx v2,r10,r8
- addi r10,r10,64
- bdnz L(aligned_128loop)
-
- /* Write remaining 1~127 bytes. */
-L(aligned_tail):
- mtocrf 0x01,r5
- bf 25,32f
- stvx v2,0,r10
- stvx v2,r10,r6
- stvx v2,r10,r7
- stvx v2,r10,r8
- addi r10,r10,64
-
-32: bf 26,16f
- stvx v2,0,r10
- stvx v2,r10,r6
- addi r10,r10,32
-
-16: bf 27,8f
- stvx v2,0,r10
- addi r10,r10,16
-
-8: bf 28,4f
- std r4,0(r10)
- addi r10,r10,8
-
- /* Copies 4~7 bytes. */
-4: bf 29,L(tail2)
- stw r4,0(r10)
- bf 30,L(tail5)
- sth r4,4(r10)
- bflr 31
- stb r4,6(r10)
- /* Return original DST pointer. */
- blr
-
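MTVSRD_V1_R4 moves the doubleword pattern into a VSX register, and
xxpermdi plus vspltb splat its low byte to all 16 lanes; each pass of
L(aligned_128loop) then issues eight stvx for 128 bytes. A rough C
analogue using GCC/Clang generic vector extensions instead of VMX
(hypothetical vector_loop; assumes p is 16-byte aligned):

    #include <stddef.h>

    typedef unsigned char v16u8 __attribute__ ((vector_size (16)));

    /* 128 bytes per iteration with 16-byte vector stores.  */
    static unsigned char *vector_loop (unsigned char *p, size_t n,
                                       unsigned char c)
    {
      v16u8 v;
      for (int i = 0; i < 16; i++)
        v[i] = c;                     /* splat c, like vspltb */
      for (size_t i = n >> 7; i > 0; i--, p += 128)
        for (int off = 0; off < 128; off += 16)
          *(v16u8 *) (p + off) = v;
      return p;                       /* tail as in L(aligned_tail) */
    }
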
- /* Special case when value is 0 and we have a long length to deal
- with. Use dcbz to zero out a full cacheline of 128 bytes at a time.
- Before using dcbz though, we need to get the destination 128-byte
- aligned. */
- .align 4
-L(huge_dcbz):
- andi. r11,r10,127
- neg r0,r10
- beq L(huge_dcbz_aligned)
-
- clrldi r0,r0,57
- subf r5,r0,r5
- srdi r0,r0,3
- mtocrf 0x01,r0
-
- /* Write 1~127 bytes until DST is aligned to 128 bytes. */
-8: bf 28,4f
-
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- std r4,32(r10)
- std r4,40(r10)
- std r4,48(r10)
- std r4,56(r10)
- addi r10,r10,64
-
- .align 4
-4: bf 29,2f
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- addi r10,r10,32
-
- .align 4
-2: bf 30,1f
- std r4,0(r10)
- std r4,8(r10)
- addi r10,r10,16
-
- .align 4
-1: bf 31,L(huge_dcbz_aligned)
- std r4,0(r10)
- addi r10,r10,8
-
-L(huge_dcbz_aligned):
- /* Setup dcbz unroll offsets and count numbers. */
- srdi r8,r5,9
- clrldi r11,r5,55
- cmpldi cr6,r11,0
- li r9,128
- cmpdi r8,0
- beq L(huge_tail)
- li r7,256
- li r6,384
- mtctr r8
-
- .align 4
-L(huge_loop):
- /* Set 512 bytes to zero in each iteration; the loop unrolling shows
-    a throughput boost for large sizes (2048 bytes or higher). */
- dcbz 0,r10
- dcbz r9,r10
- dcbz r7,r10
- dcbz r6,r10
- addi r10,r10,512
- bdnz L(huge_loop)
-
- beqlr cr6
-
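One dcbz zeroes an entire 128-byte data cache block, so each pass of
L(huge_loop) clears 512 bytes with only four instructions. A sketch
with PowerPC-only inline assembly (hypothetical zero_dcbz; assumes the
128-byte block size of POWER8, p aligned to 128 bytes, and n a multiple
of 512):

    #include <stddef.h>

    #define CACHE_LINE 128  /* POWER8 data cache block size */

    /* Zero n bytes, four cache blocks per iteration, like L(huge_loop).  */
    static void zero_dcbz (unsigned char *p, size_t n)
    {
      for (size_t i = n >> 9; i > 0; i--, p += 4 * CACHE_LINE)
        {
          __asm__ volatile ("dcbz 0,%0" :: "r" (p + 0 * CACHE_LINE) : "memory");
          __asm__ volatile ("dcbz 0,%0" :: "r" (p + 1 * CACHE_LINE) : "memory");
          __asm__ volatile ("dcbz 0,%0" :: "r" (p + 2 * CACHE_LINE) : "memory");
          __asm__ volatile ("dcbz 0,%0" :: "r" (p + 3 * CACHE_LINE) : "memory");
        }
    }
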
-L(huge_tail):
- srdi r6,r11,8
- srdi r7,r11,4
- clrldi r8,r11,4
- cmpldi cr6,r8,0
- mtocrf 0x01,r6
-
- beq cr6,L(tail)
-
- /* We have 1~511 bytes remaining. */
- .align 4
-32: bf 31,16f
- dcbz 0,r10
- dcbz r9,r10
- addi r10,r10,256
-
- .align 4
-16: mtocrf 0x01,r7
- bf 28,8f
- dcbz 0,r10
- addi r10,r10,128
-
- .align 4
-8: bf 29,4f
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- std r4,32(r10)
- std r4,40(r10)
- std r4,48(r10)
- std r4,56(r10)
- addi r10,r10,64
-
- .align 4
-4: bf 30,2f
- std r4,0(r10)
- std r4,8(r10)
- std r4,16(r10)
- std r4,24(r10)
- addi r10,r10,32
-
- .align 4
-2: bf 31,L(tail)
- std r4,0(r10)
- std r4,8(r10)
- addi r10,r10,16
- .align 4
-
- /* Remaining 1~15 bytes. */
-L(tail):
- mtocrf 0x01,r8
-
- .align 4
-8: bf 28,4f
- std r4,0(r10)
- addi r10,r10,8
-
- .align 4
-4: bf 29,2f
- stw r4,0(r10)
- addi r10,r10,4
-
- .align 4
-2: bf 30,1f
- sth r4,0(r10)
- addi r10,r10,2
-
- .align 4
-1: bflr 31
- stb r4,0(r10)
- blr
-
- /* Handle short copies of 0~31 bytes. Best throughput is achieved
- by just unrolling all operations. */
- .align 4
-L(write_LT_32):
- cmpldi cr6,r5,8
- mtocrf 0x01,r5
- ble cr6,L(write_LE_8)
-
- /* At least 9 bytes to go. */
- neg r8,r4
- andi. r0,r8,3
- cmpldi cr1,r5,16
- beq L(write_LT_32_aligned)
-
- /* Force 4-byte alignment for SRC. */
- mtocrf 0x01,r0
- subf r5,r0,r5
-
-2: bf 30,1f
- sth r4,0(r10)
- addi r10,r10,2
-
-1: bf 31,L(end_4bytes_alignment)
- stb r4,0(r10)
- addi r10,r10,1
-
- .align 4
-L(end_4bytes_alignment):
- cmpldi cr1,r5,16
- mtocrf 0x01,r5
-
-L(write_LT_32_aligned):
- blt cr1,8f
-
- stw r4,0(r10)
- stw r4,4(r10)
- stw r4,8(r10)
- stw r4,12(r10)
- addi r10,r10,16
-
-8: bf 28,L(tail4)
- stw r4,0(r10)
- stw r4,4(r10)
- addi r10,r10,8
-
- .align 4
- /* Copies 4~7 bytes. */
-L(tail4):
- bf 29,L(tail2)
- stw r4,0(r10)
- bf 30,L(tail5)
- sth r4,4(r10)
- bflr 31
- stb r4,6(r10)
- blr
-
- .align 4
- /* Copies 2~3 bytes. */
-L(tail2):
- bf 30,1f
- sth r4,0(r10)
- bflr 31
- stb r4,2(r10)
- blr
-
- .align 4
-L(tail5):
- bflr 31
- stb r4,4(r10)
- blr
-
- .align 4
-1: bflr 31
- stb r4,0(r10)
- blr
-
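For n <= 31 there are no loops at all: every size class resolves to a
fixed sequence of word and smaller stores. A condensed C sketch of the
idea (hypothetical set_small; the real code interleaves the 4-byte
alignment fix-up and the compares differently):

    #include <stdint.h>
    #include <string.h>

    /* Fully unrolled store of n (<= 31) bytes of pattern v.  */
    static void set_small (unsigned char *p, size_t n, uint64_t v)
    {
      uint32_t w = (uint32_t) v;
      if (n & 16) { memcpy (p, &w, 4); memcpy (p + 4, &w, 4);
                    memcpy (p + 8, &w, 4); memcpy (p + 12, &w, 4); p += 16; }
      if (n & 8)  { memcpy (p, &w, 4); memcpy (p + 4, &w, 4); p += 8; }
      if (n & 4)  { memcpy (p, &w, 4); p += 4; }
      if (n & 2)  { uint16_t h = (uint16_t) v; memcpy (p, &h, 2); p += 2; }
      if (n & 1)  *p = (unsigned char) v;
    }
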
- /* Handles copies of 0~8 bytes. */
- .align 4
-L(write_LE_8):
- bne cr6,L(tail4)
-
- stw r4,0(r10)
- stw r4,4(r10)
- blr
-END_GEN_TB (MEMSET,TB_TOCLESS)
-libc_hidden_builtin_def (memset)
-
-/* Copied from bzero.S to prevent the linker from inserting a stub
- between bzero and memset. */
-ENTRY (__bzero)
- CALL_MCOUNT 3
- mr r5,r4
- li r4,0
- b L(_memset)
-END (__bzero)
-#ifndef __bzero
-weak_alias (__bzero, bzero)
-#endif
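The __bzero entry only permutes arguments (length from r4 into r5, fill
byte 0 into r4) before falling into L(_memset); keeping it in this file
guarantees the branch never needs a linker stub. Its C-level equivalent:

    #include <string.h>

    /* bzero (s, n) is exactly memset (s, 0, n); the stub above performs
       this argument shuffle and branches to the shared body.  */
    static void bzero_equiv (void *s, size_t n)
    {
      memset (s, 0, n);
    }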