sysdeps/powerpc/memset.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210

/* Optimized memset implementation for PowerPC.
   Copyright (C) 1997, 1999, 2000 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU C Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

#include <sysdep.h>

/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
   Returns 's'.

   The memset is done in three sizes: byte (8 bits), word (32 bits),
   cache line (256 bits). There is a special case for setting cache lines
   to 0, to take advantage of the dcbz instruction.  */

EALIGN (memset, 5, 1)

#define rTMP	r0
#define	rRTN	r3	/* initial value of 1st argument */
#define rCHR	r4	/* char to set in each byte */
#define rLEN	r5	/* length of region to set */
#define rMEMP	r6	/* address at which we are storing */
#define rALIGN	r7	/* number of bytes we are setting now (when aligning) */
#define rMEMP2	r8

#define rPOS32	r7	/* constant +32 for clearing with dcbz */
#define rNEG64	r8	/* constant -64 for clearing with dcbz */
#define rNEG32	r9	/* constant -32 for clearing with dcbz */

/* take care of case for size <= 4  */
	cmplwi	cr1, rLEN, 4
	andi.	rALIGN, rRTN, 3
	mr	rMEMP, rRTN
	ble-	cr1, L(small)
/* align to word boundary  */
	cmplwi	cr5, rLEN, 31
	rlwimi	rCHR, rCHR, 8, 16, 23
	beq+	L(aligned)	/* 8th instruction from .align */
	mtcrf	0x01, rRTN
	subfic	rALIGN, rALIGN, 4
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	bf+	31, L(g0)
	stb	rCHR, 0(rRTN)
	bt	30, L(aligned)
L(g0):	sth	rCHR, -2(rMEMP)	/* 16th instruction from .align */
/* take care of case for size < 31 */
L(aligned):
	mtcrf	0x01, rLEN
	rlwimi	rCHR, rCHR, 16, 0, 15
	ble	cr5, L(medium)
/* align to cache line boundary...  */
	andi.	rALIGN, rMEMP, 0x1C
	subfic	rALIGN, rALIGN, 0x20
	beq	L(caligned)
	mtcrf	0x01, rALIGN
	add	rMEMP, rMEMP, rALIGN
	sub	rLEN, rLEN, rALIGN
	cmplwi	cr1, rALIGN, 0x10
	mr	rMEMP2, rMEMP
	bf	28, L(a1)
	stw	rCHR, -4(rMEMP2)
	stwu	rCHR, -8(rMEMP2)
L(a1):	blt	cr1, L(a2)
	stw	rCHR, -4(rMEMP2) /* 32nd instruction from .align */
	stw	rCHR, -8(rMEMP2)
	stw	rCHR, -12(rMEMP2)
	stwu	rCHR, -16(rMEMP2)
L(a2):	bf	29, L(caligned)
	stw	rCHR, -4(rMEMP2)
/* now aligned to a cache line.  */
L(caligned):
	cmplwi	cr1, rCHR, 0
	clrrwi.	rALIGN, rLEN, 5
	mtcrf	0x01, rLEN	/* 40th instruction from .align */
	beq	cr1, L(zloopstart) /* special case for clearing memory using dcbz */
	srwi	rTMP, rALIGN, 5
	mtctr	rTMP
	beq	L(medium)	/* we may not actually get to do a full line */
	clrlwi.	rLEN, rLEN, 27
	add	rMEMP, rMEMP, rALIGN
	li	rNEG64, -0x40
	bdz	L(cloopdone)	/* 48th instruction from .align */

L(c3):	dcbz	rNEG64, rMEMP
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stw	rCHR, -16(rMEMP)
	nop			/* let 601 fetch last 4 instructions of loop */
	stw	rCHR, -20(rMEMP)
	stw	rCHR, -24(rMEMP) /* 56th instruction from .align */
	nop			/* let 601 fetch first 8 instructions of loop */
	stw	rCHR, -28(rMEMP)
	stwu	rCHR, -32(rMEMP)
	bdnz	L(c3)
L(cloopdone):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stw	rCHR, -16(rMEMP) /* 64th instruction from .align */
	stw	rCHR, -20(rMEMP)
	cmplwi	cr1, rLEN, 16
	stw	rCHR, -24(rMEMP)
	stw	rCHR, -28(rMEMP)
	stwu	rCHR, -32(rMEMP)
	beqlr
	add	rMEMP, rMEMP, rALIGN
	b	L(medium_tail2)	/* 72nd instruction from .align */

	.align 5
	nop
/* Clear lines of memory in 128-byte chunks.  */
L(zloopstart):
	clrlwi	rLEN, rLEN, 27
	mtcrf	0x02, rALIGN
	srwi.	rTMP, rALIGN, 7
	mtctr	rTMP
	li	rPOS32, 0x20
	li	rNEG64, -0x40
	cmplwi	cr1, rLEN, 16	/* 8 */
	bf	26, L(z0)
	dcbz	0, rMEMP
	addi	rMEMP, rMEMP, 0x20
L(z0):	li	rNEG32, -0x20
	bf	25, L(z1)
	dcbz	0, rMEMP
	dcbz	rPOS32, rMEMP
	addi	rMEMP, rMEMP, 0x40 /* 16 */
L(z1):	cmplwi	cr5, rLEN, 0
	beq	L(medium)
L(zloop):
	dcbz	0, rMEMP
	dcbz	rPOS32, rMEMP
	addi	rMEMP, rMEMP, 0x80
	dcbz	rNEG64, rMEMP
	dcbz	rNEG32, rMEMP
	bdnz	L(zloop)
	beqlr	cr5
	b	L(medium_tail2)

	.align 5
L(small):
/* Memset of 4 bytes or less.  */
	cmplwi	cr5, rLEN, 1
	cmplwi	cr1, rLEN, 3
	bltlr	cr5
	stb	rCHR, 0(rMEMP)
	beqlr	cr5
	nop
	stb	rCHR, 1(rMEMP)
	bltlr	cr1
	stb	rCHR, 2(rMEMP)
	beqlr	cr1
	nop
	stb	rCHR, 3(rMEMP)
	blr

/* Memset of 0-31 bytes.  */
	.align 5
L(medium):
	cmplwi	cr1, rLEN, 16
L(medium_tail2):
	add	rMEMP, rMEMP, rLEN
L(medium_tail):
	bt-	31, L(medium_31t)
	bt-	30, L(medium_30t)
L(medium_30f):
	bt-	29, L(medium_29t)
L(medium_29f):
	bge-	cr1, L(medium_27t)
	bflr-	28
	stw	rCHR, -4(rMEMP)	/* 8th instruction from .align */
	stw	rCHR, -8(rMEMP)
	blr

L(medium_31t):
	stbu	rCHR, -1(rMEMP)
	bf-	30, L(medium_30f)
L(medium_30t):
	sthu	rCHR, -2(rMEMP)
	bf-	29, L(medium_29f)
L(medium_29t):
	stwu	rCHR, -4(rMEMP)
	blt-	cr1, L(medium_27f) /* 16th instruction from .align */
L(medium_27t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	stw	rCHR, -12(rMEMP)
	stwu	rCHR, -16(rMEMP)
L(medium_27f):
	bflr-	28
L(medium_28t):
	stw	rCHR, -4(rMEMP)
	stw	rCHR, -8(rMEMP)
	blr
END(memset)