path: root/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
author	Zack Weinberg <zackw@panix.com>	2017-06-08 15:39:03 -0400
committer	Zack Weinberg <zackw@panix.com>	2017-06-08 15:39:03 -0400
commit	5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree	4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
parent	199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
Prepare for radical source tree reorganization. (zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage directory, REORG.TODO, except for files that will certainly still exist in their current form at top level when we're done (COPYING, COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which are moved to the new directory OldChangeLogs, instead), and the generated file INSTALL (which is just deleted; in the new order, there will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S')
-rw-r--r--	REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S	458
1 file changed, 458 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000000..bc734c9f4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,458 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'. */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
+	/* There is no need to use .machine power8 here since mtvsrd is
+	   emitted via the .long define above.  This avoids breakage with
+	   binutils versions that do not support that machine specifier.  */
+ .machine power7
+EALIGN (MEMSET, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memset):
+ cmpldi cr7,r5,31
+ neg r0,r3
+ mr r10,r3
+
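+	/* insrdi rA,rS,n,b inserts the low-order n bits of rS into rA at
+	   bit position b, so the sequence below spreads the set byte into
+	   a halfword, then a word, and (further down) a doubleword.  */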
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32 /* Replicate byte to word. */
+ ble cr7,L(write_LT_32)
+
+ andi. r11,r10,15 /* Check alignment of DST. */
+ insrdi r4,r4,32,0 /* Replicate word to double word. */
+
+ beq L(big_aligned)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+ /* Get DST aligned to 16 bytes. */
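+	/* The low 4 bits of r0 = -DST (already copied into CR7 above) give
+	   the distance to the next 16-byte boundary; each 'bf' below tests
+	   one of those bits and stores a 1/2/4/8-byte chunk.  */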
+1: bf 31,2f
+ stb r4,0(r10)
+ addi r10,r10,1
+
+2: bf 30,4f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+4: bf 29,8f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+8: bf 28,16f
+ std r4,0(r10)
+ addi r10,r10,8
+
+16: subf r5,r0,r5
+
+ .align 4
+L(big_aligned):
+	/* For sizes larger than 255 bytes there are two possible paths:
+	   - if the constant is '0', zero full cache lines with dcbz;
+	   - otherwise, use vector instructions.  */
+ cmpldi cr5,r5,255
+ dcbtst 0,r10
+ cmpldi cr6,r4,0
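+	/* crand combines cr6.eq (r4 == 0) with cr5.gt (r5 > 255) into CR
+	   bit 27, so 'bt 27' takes the dcbz path only when zeroing a large
+	   block, while other sizes of 255 bytes or more take the vector
+	   path.  */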
+ crand 27,26,21
+ bt 27,L(huge_dcbz)
+ bge cr5,L(huge_vector)
+
+
+	/* Size between 32 and 255 bytes with a constant different from 0:
+	   use doubleword stores to achieve the best throughput.  */
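+	/* r8 = number of 32-byte blocks; r11 = remaining 0~31 bytes,
+	   handled in L(tail_bytes).  */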
+ srdi r8,r5,5
+ clrldi r11,r5,59
+ cmpldi cr6,r11,0
+ cmpdi r8,0
+ beq L(tail_bytes)
+ mtctr r8
+
+ /* Main aligned write loop, writes 32-bytes at a time. */
+ .align 4
+L(big_loop):
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+ bdz L(tail_bytes)
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+	addi	r10,r10,32
+ bdnz L(big_loop)
+
+ b L(tail_bytes)
+
+ /* Write remaining 1~31 bytes. */
+ .align 4
+L(tail_bytes):
+ beqlr cr6
+
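+	/* r7 carries the 16-byte bit and r8 the low 4 bits of the
+	   remaining length; each 'bf' below tests one length bit so every
+	   power-of-two chunk is stored at most once.  */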
+ srdi r7,r11,4
+ clrldi r8,r11,60
+ mtocrf 0x01,r7
+
+ .align 4
+ bf 31,8f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+8: mtocrf 0x1,r8
+ bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+ .align 4
+2: bf 30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+ .align 4
+1: bflr 31
+	stb	r4,0(r10)
+ blr
+
+	/* Size larger than 255 bytes with a constant different from 0: use
+	   vector instructions to achieve the best throughput.  */
+L(huge_vector):
+ /* Replicate set byte to quadword in VMX register. */
+ MTVSRD_V1_R4
+ xxpermdi 32,v0,v1,0
+ vspltb v2,v0,15
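+	/* The raw opcode moves the doubleword pattern from r4 into a VSX
+	   register; xxpermdi positions that doubleword and vspltb
+	   broadcasts the set byte into all 16 lanes of v2 for the stvx
+	   stores below.  */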
+
+ /* Main aligned write loop: 128 bytes at a time. */
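+	/* r6/r7/r8 hold the 16/32/48-byte offsets used by the stvx stores;
+	   r12 counts 128-byte iterations, and the length bits moved into
+	   the CR drive the 'bf' tests in L(aligned_tail).  */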
+ li r6,16
+ li r7,32
+ li r8,48
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail)
+ mtctr r12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128loop):
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ bdnz L(aligned_128loop)
+
+ /* Write remaining 1~127 bytes. */
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+
+32: bf 26,16f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ addi r10,r10,32
+
+16: bf 27,8f
+ stvx v2,0,r10
+ addi r10,r10,16
+
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+	/* Writes 4~7 bytes.  */
+4: bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ /* Return original DST pointer. */
+ blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out full cache lines of 128 bytes at a
+	   time.  Before using dcbz, though, we need to get the destination
+	   128-byte aligned.  */
+ .align 4
+L(huge_dcbz):
+ andi. r11,r10,127
+ neg r0,r10
+ beq L(huge_dcbz_aligned)
+
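+	/* The low 7 bits of r0 = -DST give the distance to the next
+	   128-byte boundary; subtract it from the length and convert it to
+	   a doubleword count so the CR-driven stores below can pad with
+	   8/16/32/64-byte steps.  */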
+ clrldi r0,r0,57
+ subf r5,r0,r5
+ srdi r0,r0,3
+ mtocrf 0x01,r0
+
+ /* Write 1~128 bytes until DST is aligned to 128 bytes. */
+8: bf 28,4f
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 29,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 30,1f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+1: bf 31,L(huge_dcbz_aligned)
+ std r4,0(r10)
+ addi r10,r10,8
+
+L(huge_dcbz_aligned):
+	/* Set up dcbz unroll offsets and iteration counts.  */
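+	/* r8 = number of 512-byte iterations; r11 = remaining low 9 bits
+	   of the length; r9/r7/r6 hold the 128/256/384-byte offsets used
+	   by the dcbz unrolling.  */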
+ srdi r8,r5,9
+ clrldi r11,r5,55
+ cmpldi cr6,r11,0
+ li r9,128
+ cmpdi r8,0
+ beq L(huge_tail)
+ li r7,256
+ li r6,384
+ mtctr r8
+
+ .align 4
+L(huge_loop):
+	/* Sets 512 bytes to zero in each iteration; the loop unrolling shows
+	   a throughput boost for large sizes (2048 bytes or higher).  */
+ dcbz 0,r10
+ dcbz r9,r10
+ dcbz r7,r10
+ dcbz r6,r10
+ addi r10,r10,512
+ bdnz L(huge_loop)
+
+ beqlr cr6
+
+L(huge_tail):
+ srdi r6,r11,8
+ srdi r7,r11,4
+ clrldi r8,r11,4
+ cmpldi cr6,r8,0
+ mtocrf 0x01,r6
+
+ beq cr6,L(tail)
+
+ /* We have 1~511 bytes remaining. */
+ .align 4
+32: bf 31,16f
+ dcbz 0,r10
+ dcbz r9,r10
+ addi r10,r10,256
+
+ .align 4
+16: mtocrf 0x01,r7
+ bf 28,8f
+ dcbz 0,r10
+ addi r10,r10,128
+
+ .align 4
+8: bf 29,4f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 30,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 31,L(tail)
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+ .align 4
+
+ /* Remaining 1~15 bytes. */
+L(tail):
+ mtocrf 0x01,r8
+
+	.align	4
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+ .align 4
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+	/* Handle short writes of 0~31 bytes.  Best throughput is achieved
+	   by just unrolling all operations.  */
+ .align 4
+L(write_LT_32):
+	cmpldi	cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(write_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(write_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,r0
+ subf r5,r0,r5
+
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+1: bf 31,L(end_4bytes_alignment)
+ stb r4,0(r10)
+ addi r10,r10,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(write_LT_32_aligned):
+ blt cr1,8f
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ stw r4,8(r10)
+ stw r4,12(r10)
+ addi r10,r10,16
+
+8: bf 28,L(tail4)
+ stw r4,0(r10)
+ stw r4,4(r10)
+ addi r10,r10,8
+
+ .align 4
+	/* Writes 4~7 bytes.  */
+L(tail4):
+ bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ blr
+
+ .align 4
+	/* Writes 2~3 bytes.  */
+L(tail2):
+ bf 30,1f
+ sth r4,0(r10)
+ bflr 31
+ stb r4,2(r10)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ stb r4,4(r10)
+ blr
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+	/* Handles writes of 0~8 bytes.  */
+ .align 4
+L(write_LE_8):
+ bne cr6,L(tail4)
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ blr
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY (__bzero)
+ CALL_MCOUNT 3
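+	/* bzero (s, n) is memset (s, 0, n): move the length into r5 and
+	   zero the fill value before falling into memset's entry point.  */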
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
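
As a side note for readers tracing the assembly: the core technique of the
32~255-byte path (replicate the fill byte into a doubleword, then peel the
tail one power-of-two chunk per length bit, which the assembly does with
mtocrf/bf) can be sketched in portable C. The sketch below is illustrative
only; it is not part of this commit or of glibc, and memset_sketch is a
hypothetical name.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative sketch, not glibc code.  */
    static void *memset_sketch (void *s, int c, size_t n)
    {
      unsigned char *p = s;
      uint64_t v = (unsigned char) c;
      v |= v << 8;             /* byte -> halfword  (insrdi r4,r4,8,48)  */
      v |= v << 16;            /* halfword -> word  (insrdi r4,r4,16,32) */
      v |= v << 32;            /* word -> doubleword (insrdi r4,r4,32,0) */

      while (n >= 32)          /* main loop: four 8-byte stores (std x4) */
        {
          memcpy (p, &v, 8);  memcpy (p + 8, &v, 8);
          memcpy (p + 16, &v, 8);  memcpy (p + 24, &v, 8);
          p += 32;  n -= 32;
        }
      /* Tail: each bit of n selects one power-of-two chunk, as the
         assembly does with bf 28..31 after mtocrf.  */
      if (n & 16) { memcpy (p, &v, 8); memcpy (p + 8, &v, 8); p += 16; }
      if (n & 8)  { memcpy (p, &v, 8); p += 8; }
      if (n & 4)  { memcpy (p, &v, 4); p += 4; }
      if (n & 2)  { memcpy (p, &v, 2); p += 2; }
      if (n & 1)  *p = (unsigned char) v;
      return s;
    }

    int main (void)
    {
      char buf[97];
      memset_sketch (buf, 0xAB, sizeof buf);
      printf ("%02x %02x\n", (unsigned char) buf[0], (unsigned char) buf[96]);
      return 0;
    }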