path: root/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
author	Zack Weinberg <zackw@panix.com>	2017-06-08 15:39:03 -0400
committer	Zack Weinberg <zackw@panix.com>	2017-06-08 15:39:03 -0400
commit	5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree	4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
parent	199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
Prepare for radical source tree reorganization. (zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage directory, REORG.TODO, except for files that will certainly still exist in their current form at top level when we're done (COPYING, COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which are moved to the new directory OldChangeLogs, instead), and the generated file INSTALL (which is just deleted; in the new order, there will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S')
-rw-r--r--	REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S	458
1 file changed, 458 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000000..bc734c9f4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,458 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'. */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
+	/* There is no need to use .machine power8 here since mtvsrd is
+	   emitted via the .long define above.  This avoids breakage with
+	   binutils versions that do not support that machine specifier.  */
+ .machine power7
+EALIGN (MEMSET, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memset):
+ cmpldi cr7,r5,31
+ neg r0,r3
+ mr r10,r3
+
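+	/* insrdi rA,rS,n,b inserts the low-order n bits of rS into rA at
+	   bit position b, so the sequence below spreads the set byte into
+	   a halfword, then a word, and (further down) a doubleword.  */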
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32 /* Replicate byte to word. */
+ ble cr7,L(write_LT_32)
+
+ andi. r11,r10,15 /* Check alignment of DST. */
+ insrdi r4,r4,32,0 /* Replicate word to double word. */
+
+ beq L(big_aligned)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+ /* Get DST aligned to 16 bytes. */
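+	/* The low 4 bits of r0 = -DST (already copied into CR7 above) give
+	   the distance to the next 16-byte boundary; each 'bf' below tests
+	   one of those bits and stores a 1/2/4/8-byte chunk.  */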
+1: bf 31,2f
+ stb r4,0(r10)
+ addi r10,r10,1
+
+2: bf 30,4f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+4: bf 29,8f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+8: bf 28,16f
+ std r4,0(r10)
+ addi r10,r10,8
+
+16: subf r5,r0,r5
+
+ .align 4
+L(big_aligned):
+	/* For sizes larger than 255 bytes there are two possible paths:
+	   - if the constant is '0', zero full cache lines with dcbz;
+	   - otherwise, use vector instructions.  */
+ cmpldi cr5,r5,255
+ dcbtst 0,r10
+ cmpldi cr6,r4,0
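+	/* crand combines cr6.eq (r4 == 0) with cr5.gt (r5 > 255) into CR
+	   bit 27, so 'bt 27' takes the dcbz path only when zeroing a large
+	   block, while other sizes of 255 bytes or more take the vector
+	   path.  */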
+ crand 27,26,21
+ bt 27,L(huge_dcbz)
+ bge cr5,L(huge_vector)
+
+
+	/* Size between 32 and 255 bytes with a constant different from 0:
+	   use doubleword stores to achieve the best throughput.  */
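+	/* r8 = number of 32-byte blocks; r11 = remaining 0~31 bytes,
+	   handled in L(tail_bytes).  */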
+ srdi r8,r5,5
+ clrldi r11,r5,59
+ cmpldi cr6,r11,0
+ cmpdi r8,0
+ beq L(tail_bytes)
+ mtctr r8
+
+ /* Main aligned write loop, writes 32-bytes at a time. */
+ .align 4
+L(big_loop):
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+ bdz L(tail_bytes)
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+	addi	r10,r10,32
+ bdnz L(big_loop)
+
+ b L(tail_bytes)
+
+ /* Write remaining 1~31 bytes. */
+ .align 4
+L(tail_bytes):
+ beqlr cr6
+
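+	/* r7 carries the 16-byte bit and r8 the low 4 bits of the
+	   remaining length; each 'bf' below tests one length bit so every
+	   power-of-two chunk is stored at most once.  */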
+ srdi r7,r11,4
+ clrldi r8,r11,60
+ mtocrf 0x01,r7
+
+ .align 4
+ bf 31,8f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+8: mtocrf 0x1,r8
+ bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+ .align 4
+2: bf 30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+ .align 4
+1: bflr 31
+	stb	r4,0(r10)
+ blr
+
+	/* Size larger than 255 bytes with a constant different from 0: use
+	   vector instructions to achieve the best throughput.  */
+L(huge_vector):
+ /* Replicate set byte to quadword in VMX register. */
+ MTVSRD_V1_R4
+ xxpermdi 32,v0,v1,0
+ vspltb v2,v0,15
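+	/* The raw opcode moves the doubleword pattern from r4 into a VSX
+	   register; xxpermdi positions that doubleword and vspltb
+	   broadcasts the set byte into all 16 lanes of v2 for the stvx
+	   stores below.  */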
+
+ /* Main aligned write loop: 128 bytes at a time. */
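+	/* r6/r7/r8 hold the 16/32/48-byte offsets used by the stvx stores;
+	   r12 counts 128-byte iterations, and the length bits moved into
+	   the CR drive the 'bf' tests in L(aligned_tail).  */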
+ li r6,16
+ li r7,32
+ li r8,48
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail)
+ mtctr r12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128loop):
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ bdnz L(aligned_128loop)
+
+ /* Write remaining 1~127 bytes. */
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+
+32: bf 26,16f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ addi r10,r10,32
+
+16: bf 27,8f
+ stvx v2,0,r10
+ addi r10,r10,16
+
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+	/* Writes 4~7 bytes.  */
+4: bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ /* Return original DST pointer. */
+ blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out full cache lines of 128 bytes at a
+	   time.  Before using dcbz, though, we need to get the destination
+	   128-byte aligned.  */
+ .align 4
+L(huge_dcbz):
+ andi. r11,r10,127
+ neg r0,r10
+ beq L(huge_dcbz_aligned)
+
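+	/* The low 7 bits of r0 = -DST give the distance to the next
+	   128-byte boundary; subtract it from the length and convert it to
+	   a doubleword count so the CR-driven stores below can pad with
+	   8/16/32/64-byte steps.  */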
+ clrldi r0,r0,57
+ subf r5,r0,r5
+ srdi r0,r0,3
+ mtocrf 0x01,r0
+
+ /* Write 1~128 bytes until DST is aligned to 128 bytes. */
+8: bf 28,4f
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 29,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 30,1f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+1: bf 31,L(huge_dcbz_aligned)
+ std r4,0(r10)
+ addi r10,r10,8
+
+L(huge_dcbz_aligned):
+	/* Set up dcbz unroll offsets and iteration counts.  */
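+	/* r8 = number of 512-byte iterations; r11 = remaining low 9 bits
+	   of the length; r9/r7/r6 hold the 128/256/384-byte offsets used
+	   by the dcbz unrolling.  */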
+ srdi r8,r5,9
+ clrldi r11,r5,55
+ cmpldi cr6,r11,0
+ li r9,128
+ cmpdi r8,0
+ beq L(huge_tail)
+ li r7,256
+ li r6,384
+ mtctr r8
+
+ .align 4
+L(huge_loop):
+	/* Sets 512 bytes to zero in each iteration; the loop unrolling shows
+	   a throughput boost for large sizes (2048 bytes or higher).  */
+ dcbz 0,r10
+ dcbz r9,r10
+ dcbz r7,r10
+ dcbz r6,r10
+ addi r10,r10,512
+ bdnz L(huge_loop)
+
+ beqlr cr6
+
+L(huge_tail):
+ srdi r6,r11,8
+ srdi r7,r11,4
+ clrldi r8,r11,4
+ cmpldi cr6,r8,0
+ mtocrf 0x01,r6
+
+ beq cr6,L(tail)
+
+ /* We have 1~511 bytes remaining. */
+ .align 4
+32: bf 31,16f
+ dcbz 0,r10
+ dcbz r9,r10
+ addi r10,r10,256
+
+ .align 4
+16: mtocrf 0x01,r7
+ bf 28,8f
+ dcbz 0,r10
+ addi r10,r10,128
+
+ .align 4
+8: bf 29,4f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 30,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 31,L(tail)
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+ .align 4
+
+ /* Remaining 1~15 bytes. */
+L(tail):
+ mtocrf 0x01,r8
+
+	.align	4
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+ .align 4
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+	/* Handle short writes of 0~31 bytes.  Best throughput is achieved
+	   by just unrolling all operations.  */
+ .align 4
+L(write_LT_32):
+	cmpldi	cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(write_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(write_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,r0
+ subf r5,r0,r5
+
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+1: bf 31,L(end_4bytes_alignment)
+ stb r4,0(r10)
+ addi r10,r10,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(write_LT_32_aligned):
+ blt cr1,8f
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ stw r4,8(r10)
+ stw r4,12(r10)
+ addi r10,r10,16
+
+8: bf 28,L(tail4)
+ stw r4,0(r10)
+ stw r4,4(r10)
+ addi r10,r10,8
+
+ .align 4
+	/* Writes 4~7 bytes.  */
+L(tail4):
+ bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ blr
+
+ .align 4
+	/* Writes 2~3 bytes.  */
+L(tail2):
+ bf 30,1f
+ sth r4,0(r10)
+ bflr 31
+ stb r4,2(r10)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ stb r4,4(r10)
+ blr
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+	/* Handles writes of 0~8 bytes.  */
+ .align 4
+L(write_LE_8):
+ bne cr6,L(tail4)
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ blr
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY (__bzero)
+ CALL_MCOUNT 3
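+	/* bzero (s, n) is memset (s, 0, n): move the length into r5 and
+	   zero the fill value before falling into memset's entry point.  */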
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
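
As a side note for readers tracing the assembly: the core technique of the
32~255-byte path (replicate the fill byte into a doubleword, then peel the
tail one power-of-two chunk per length bit, which the assembly does with
mtocrf/bf) can be sketched in portable C. The sketch below is illustrative
only; it is not part of this commit or of glibc, and memset_sketch is a
hypothetical name.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative sketch, not glibc code.  */
    static void *memset_sketch (void *s, int c, size_t n)
    {
      unsigned char *p = s;
      uint64_t v = (unsigned char) c;
      v |= v << 8;             /* byte -> halfword  (insrdi r4,r4,8,48)  */
      v |= v << 16;            /* halfword -> word  (insrdi r4,r4,16,32) */
      v |= v << 32;            /* word -> doubleword (insrdi r4,r4,32,0) */

      while (n >= 32)          /* main loop: four 8-byte stores (std x4) */
        {
          memcpy (p, &v, 8);  memcpy (p + 8, &v, 8);
          memcpy (p + 16, &v, 8);  memcpy (p + 24, &v, 8);
          p += 32;  n -= 32;
        }
      /* Tail: each bit of n selects one power-of-two chunk, as the
         assembly does with bf 28..31 after mtocrf.  */
      if (n & 16) { memcpy (p, &v, 8); memcpy (p + 8, &v, 8); p += 16; }
      if (n & 8)  { memcpy (p, &v, 8); p += 8; }
      if (n & 4)  { memcpy (p, &v, 4); p += 4; }
      if (n & 2)  { memcpy (p, &v, 2); p += 2; }
      if (n & 1)  *p = (unsigned char) v;
      return s;
    }

    int main (void)
    {
      char buf[97];
      memset_sketch (buf, 0xAB, sizeof buf);
      printf ("%02x %02x\n", (unsigned char) buf[0], (unsigned char) buf[96]);
      return 0;
    }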