; sysdeps/m88k/m88100/add_n.S

; mc88100 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
; sum in a third limb vector.

; Copyright (C) 1992, 1994 Free Software Foundation, Inc.

; This file is part of the GNU MP Library.

; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2, or (at your option)
; any later version.

; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.

; You should have received a copy of the GNU General Public License
; along with the GNU MP Library; see the file COPYING.  If not, write to
; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.


; INPUT PARAMETERS
; res_ptr	r2
; s1_ptr	r3
; s2_ptr	r4
; size		r5

; This code has been optimized to run one instruction per clock, avoiding
; load stalls and writeback contention.  As a result, the instruction
; order is not always natural.

; The speed is approximately 4.3 clocks/limb + 18 clocks/limb-vector.

#include "sysdep.h"

; __mpn_add_n: add the size-limb vectors at s1_ptr (r3) and s2_ptr (r4),
; store the sums at res_ptr (r2), and return the final carry in r2.
;
; Limbs are accessed as 4-byte words.  The main loop is unrolled 16 ways,
; each limb step being 4 instructions (16 bytes of code).  For sizes that
; are not a multiple of 16 the code jumps into the middle of the unrolled
; body, with the three pointers biased downwards so that the fixed offsets
; used there address the first limbs of the vectors.  Successive steps
; alternate between the register pairs r6/r7 and r8/r9, which is why the
; first limbs are placed in both pairs before the jump.
;
; Register use inside the routine:
;   r6,r7 / r8,r9	operand limbs (s1 limb, s2 limb), alternating per step
;   r10		number of full 16-limb passes still to run
;   r12		computed entry address into the unrolled body
ENTRY (__mpn_add_n)
	ld	r6,r3,0			; read first limb from s1_ptr
	extu	r10,r5,4		; r10 = size / 16 (loop counter)
	ld	r7,r4,0			; read first limb from s2_ptr

	subu.co	r5,r0,r5		; r5 = -size (clear carry as side effect)
	mak	r5,r5,4<4>		; r5 = (-size mod 16) * 16 = code offset of entry step
	bcnd	eq0,r5,Lzero		; size is a multiple of 16: start a full pass

	or	r12,r0,lo16(Lbase)
	or.u	r12,r12,hi16(Lbase)
	addu	r12,r12,r5		; r12 is address for entering in loop

	extu	r5,r5,2			; divide by 4
					; (r5 is now 4 bytes per skipped limb step)
	subu	r2,r2,r5		; adjust res_ptr
	subu	r3,r3,r5		; adjust s1_ptr
	subu	r4,r4,r5		; adjust s2_ptr

	or	r8,r6,r0		; duplicate first s1 limb: entry step may use r8/r9

	jmp.n	r12			; enter unrolled body; next insn runs in delay slot
	 or	r9,r7,r0		; duplicate first s2 limb

; Top of a full pass.  The sum of the 16th limb of the previous pass is
; still in r8 (it was formed in the delay slot of the loop branch) and is
; stored here, interleaved with advancing all pointers by 16 limbs (64
; bytes) and loading the first limb pair of the new pass.  None of the
; addu/subu instructions below use the .co/.cio forms, so the carry chain
; of the addu.cio instructions is not touched by them.
Loop:	addu	r3,r3,64
	st	r8,r2,60		; store 16th sum of previous pass (before r2 moves)
	addu	r4,r4,64
	ld	r6,r3,0
	addu	r2,r2,64
	ld	r7,r4,0
Lzero:	subu	r10,r10,1	; add 0 + 16r limbs (adjust loop counter)
; Lbase is the reference address for the computed jump.  Every step from
; here to the loop branch is exactly 4 instructions: load next s1 limb,
; add the current pair with carry in and out, load next s2 limb, store
; the sum.  The step size must not change, or the entry arithmetic above
; breaks.  The comment on each step gives the vector sizes (16r plus a
; remainder) for which that step is the entry point.
Lbase:	ld	r8,r3,4
	addu.cio r6,r6,r7
	ld	r9,r4,4
	st	r6,r2,0
	ld	r6,r3,8		; add 15 + 16r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,8
	st	r8,r2,4
	ld	r8,r3,12	; add 14 + 16r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,12
	st	r6,r2,8
	ld	r6,r3,16	; add 13 + 16r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,16
	st	r8,r2,12
	ld	r8,r3,20	; add 12 + 16r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,20
	st	r6,r2,16
	ld	r6,r3,24	; add 11 + 16r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,24
	st	r8,r2,20
	ld	r8,r3,28	; add 10 + 16r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,28
	st	r6,r2,24
	ld	r6,r3,32	; add 9 + 16r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,32
	st	r8,r2,28
	ld	r8,r3,36	; add 8 + 16r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,36
	st	r6,r2,32
	ld	r6,r3,40	; add 7 + 16r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,40
	st	r8,r2,36
	ld	r8,r3,44	; add 6 + 16r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,44
	st	r6,r2,40
	ld	r6,r3,48	; add 5 + 16r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,48
	st	r8,r2,44
	ld	r8,r3,52	; add 4 + 16r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,52
	st	r6,r2,48
	ld	r6,r3,56	; add 3 + 16r limbs
	addu.cio r8,r8,r9
	ld	r7,r4,56
	st	r8,r2,52
	ld	r8,r3,60	; add 2 + 16r limbs
	addu.cio r6,r6,r7
	ld	r9,r4,60
	st	r6,r2,56
	bcnd.n	ne0,r10,Loop	; add 1 + 16r limbs
	 addu.cio r8,r8,r9	; delay slot: 16th sum, runs whether or not branch is taken

	st	r8,r2,60		; store most significant limb

	jmp.n	 r1			; return; next insn runs in delay slot
	 addu.ci r2,r0,r0		; return carry-out from most sign. limb