[unix-history] / usr / src / lib / libm / national / sqrt.s

; Copyright (c) 1985 Regents of the University of California.
; All rights reserved.
;
; %sccs.include.redist.semicolon%
;
;	@(#)sqrt.s	5.4 (Berkeley) %G%
;

; double sqrt(x)
; double x;
; IEEE double precision sqrt
; code in NSC assembly by K.C. Ng
; 12/13/85
;
; Method:
; 	Use Kahan's trick to get 8 bits initial approximation
;	by integer shift and add/subtract. Then three Newton
;	iterations to get down error to within one ulp. Finally
;	twiddle the last bit to make to correctly rounded 
;	according to the rounding mode.
;
	.vers	2
	.text
	.align	2
	.globl	_sqrt
_sqrt:
	enter	[r3,r4,r5,r6,r7],44
	movl	f4,tos
	movl	f6,tos
	movd	2146435072,r2	; r2 = 0x7ff00000
	movl	8(fp),f0	; f2 = x 
	movd	12(fp),r3	; r3 = high part of x
	movd	r3,r4		; make a copy of high part of x in r4
	andd	r2,r3		; r3 become the bias exponent of x
	cmpd	r2,r3		; if r3 = 0x7ff00000 then x is INF or NAN
	bne	L22
				; to see if x is INF
	movd	8(fp),r0	; r0 = low part of x
	movd	r4,r1		; r1 is high part of x again
	andd	0xfff00000,r1	; mask off the sign and exponent of x
	ord	r0,r1		; or with low part, if 0 then x is INF
	cmpqd	0,r1		; 
	bne	L1		; not 0; therefore x is NaN; return x.
	cmpqd	0,r4		; now x is Inf, is it +inf?
	blt	L1		; +INF, return x 
				; -INF, return NaN by doing 0/0
nan:	movl	0f0.0,f0	; 
	divl	f0,f0
	br	L1
L22:				; now x is finite
	cmpl	0f0.0,f0	; x = 0 ?
	beq	L1		; return x if x is +0 or -0
	cmpqd	0,r4		; Is x < 0 ?
	bgt	nan		; if x < 0 return NaN
	movqd	0,r5		; r5 == scalx initialize to zero
	cmpqd	0,r3		; is x is subnormal ?  (r3 is the exponent)
	bne	L21		; if x is normal, goto L21
	movl	L30,f2		; f2 = 2**54
	mull	f2,f0		; scale up x by 2**54
	subd	0x1b00000,r5	; off set the scale factor by -27 in exponent
L21:
				; now x is normal
				; notations:
				;    r1 == copy of fsr
				;    r2 == mask of e inexact enable flag
				;    r3 == mask of i inexact flag
				;    r4 == mask of r rounding mode
				;    r5 == x's scale factor (already defined)

	movd	0x20,r2
	movd	0x40,r3
	movd	0x180,r4
	sfsr	r0		; store fsr to r0
	movd	r0,r1		; make a copy of fsr to r1
	bicd	[5,6,7,8],r0	; clear e,i, and set r to round to nearest
	lfsr	r0

				; begin to compute sqrt(x)
	movl	f0,-8(fp)
	movd	-4(fp),r0	; r0 the high part of modified x
	lshd	-1,r0		; r0 >> 1
	addd	0x1ff80000,r0	; add correction to r0  ...got 5 bits approx.
	movd	r0,r6
	lshd	-13,r6		; r6 = r0>>-15
	andd	0x7c,r6		; obtain 4*leading 5 bits of r0
	addrd	L29,r7		; r7 = address of L29 = table[0]
	addd	r6,r7		; r6 = address of L29[r6] = table[r6]
	subd	0(r7),r0	; r0 = r0 - table[r6]
	movd	r0,-4(fp)
	movl	-8(fp),f2	; now f2 = y approximate sqrt(f0) to 8 bits

	movl	0f0.5,f6	; f6 = 0.5
	movl	f0,f4
	divl	f2,f4		; t = x/y
	addl	f4,f2		; y = y + x/y
	mull	f6,f2		; y = 0.5(y+x/y) got 17 bits approx.
	movl	f0,f4
	divl	f2,f4		; t = x/y
	addl	f4,f2		; y = y + x/y
	mull	f6,f2		; y = 0.5(y+x/y) got 35 bits approx.
	movl	f0,f4
	divl	f2,f4		; t = x/y
	subl	f2,f4		; t = x/y - y
	mull	f6,f4		; t = 0.5(x/y-y)
	addl	f4,f2		; y = y + 0.5(x/y -y) 
				; now y approx. sqrt(x) to within 1 ulp

				; twiddle last bit to force y correctly rounded
	movd	r1,r0		; restore the old fsr
	bicd	[6,7,8],r0	; clear inexact bit but retain inexact enable
	ord	0x80,r0		; set rounding mode to round to zero
	lfsr	r0

	movl	f0,f4
	divl	f2,f4		; f4 = x/y
	sfsr	r0
	andd	r3,r0		; get the inexact flag
	cmpqd	0,r0
	bne	L18
				; if x/y exact, then ...
	cmpl	f2,f4		; if y == x/y 
	beq	L2
	movl	f4,-8(fp)
	subd	1,-8(fp)
	subcd	0,-4(fp)	
	movl	-8(fp),f4	; f4 = f4 - ulp
L18:
	bicd	[6],r1
	ord	r3,r1		; set inexact flag in r1

	andd	r1,r4		; r4 = the old rounding mode
	cmpqd	0,r4		; round to nearest?
	bne	L17
	movl	f4,-8(fp)
	addd	1,-8(fp)
	addcd	0,-4(fp)
	movl	-8(fp),f4	; f4 = f4 + ulp
	br	L16
L17:
	cmpd	0x100,r4	; round to positive inf ?
	bne	L16
	movl	f4,-8(fp)	
	addd	1,-8(fp)
	addcd	0,-4(fp)
	movl	-8(fp),f4	; f4 = f4 + ulp

	movl	f2,-8(fp)	
	addd	1,-8(fp)
	addcd	0,-4(fp)
	movl	-8(fp),f2	; f2 = f2 + ulp
L16:
	addl	f4,f2		; y  = y + t
	subd	0x100000,r5	; scalx = scalx - 1
L2:
	movl	f2,-8(fp)	
	addd	r5,-4(fp)
	movl	-8(fp),f0
	lfsr	r1
L1:
	movl	tos,f6
	movl	tos,f4
	exit	[r3,r4,r5,r6,r7]
	ret	0
	.data
L28:	.byte	64,40,35,41,115,113,114,116,46,99
	.byte	9,49,46,49,32,40,117,99,98,46
	.byte	101,108,101,102,117,110,116,41,32,57
	.byte	47,49,57,47,56,53,0
L29:	.blkb	4
	.double	1204
	.double	3062
	.double	5746
	.double	9193
	.double	13348
	.double	18162
	.double	23592
	.double	29598
	.double	36145
	.double	43202
	.double	50740
	.double	58733
	.double	67158
	.double	75992
	.double	85215
	.double	83599
	.double	71378
	.double	60428
	.double	50647
	.double	41945
	.double	34246
	.double	27478
	.double	21581
	.double	16499
	.double	12183
	.double	8588
	.double	5674
	.double	3403
	.double	1742
	.double	661
	.double	130
L30:	.blkb	4
	.double	1129316352 	;L30:	.double 0,0x43500000
L31:	.blkb	4
	.double 0x1ff00000
L32:	.blkb	4
	.double 0x5ff00000
Commit	Line	Data
bc0f2b0e KB	1	; Copyright (c) 1985 Regents of the University of California.
	2	; All rights reserved.
	3	;
2bc65d90	4	; %sccs.include.redist.semicolon%
bc0f2b0e	5	;
2bc65d90	6	; @(#)sqrt.s 5.4 (Berkeley) %G%
a399f6c8	7	;
bc0f2b0e	8
3cd28ebc GK	9	; double sqrt(x)
	10	; double x;
	11	; IEEE double precision sqrt
	12	; code in NSC assembly by K.C. Ng
	13	; 12/13/85
	14	;
	15	; Method:
	16	; Use Kahan's trick to get 8 bits initial approximation
	17	; by integer shift and add/subtract. Then three Newton
	18	; iterations to get down error to within one ulp. Finally
	19	; twiddle the last bit to make to correctly rounded
	20	; according to the rounding mode.
	21	;
	22	.vers 2
	23	.text
	24	.align 2
	25	.globl _sqrt
	26	_sqrt:
	27	enter [r3,r4,r5,r6,r7],44
	28	movl f4,tos
	29	movl f6,tos
	30	movd 2146435072,r2 ; r2 = 0x7ff00000
	31	movl 8(fp),f0 ; f2 = x
	32	movd 12(fp),r3 ; r3 = high part of x
	33	movd r3,r4 ; make a copy of high part of x in r4
	34	andd r2,r3 ; r3 become the bias exponent of x
	35	cmpd r2,r3 ; if r3 = 0x7ff00000 then x is INF or NAN
	36	bne L22
	37	; to see if x is INF
	38	movd 8(fp),r0 ; r0 = low part of x
	39	movd r4,r1 ; r1 is high part of x again
	40	andd 0xfff00000,r1 ; mask off the sign and exponent of x
	41	ord r0,r1 ; or with low part, if 0 then x is INF
	42	cmpqd 0,r1 ;
	43	bne L1 ; not 0; therefore x is NaN; return x.
	44	cmpqd 0,r4 ; now x is Inf, is it +inf?
	45	blt L1 ; +INF, return x
	46	; -INF, return NaN by doing 0/0
	47	nan: movl 0f0.0,f0 ;
	48	divl f0,f0
	49	br L1
	50	L22: ; now x is finite
	51	cmpl 0f0.0,f0 ; x = 0 ?
	52	beq L1 ; return x if x is +0 or -0
	53	cmpqd 0,r4 ; Is x < 0 ?
	54	bgt nan ; if x < 0 return NaN
	55	movqd 0,r5 ; r5 == scalx initialize to zero
	56	cmpqd 0,r3 ; is x is subnormal ? (r3 is the exponent)
	57	bne L21 ; if x is normal, goto L21
	58	movl L30,f2 ; f2 = 2**54
	59	mull f2,f0 ; scale up x by 2**54
	60	subd 0x1b00000,r5 ; off set the scale factor by -27 in exponent
	61	L21:
	62	; now x is normal
	63	; notations:
	64	; r1 == copy of fsr
	65	; r2 == mask of e inexact enable flag
	66	; r3 == mask of i inexact flag
	67	; r4 == mask of r rounding mode
	68	; r5 == x's scale factor (already defined)
	69
	70	movd 0x20,r2
	71	movd 0x40,r3
	72	movd 0x180,r4
73	sfsr r0 ; store fsr to r0
74	movd r0,r1 ; make a copy of fsr to r1
75	bicd [5,6,7,8],r0 ; clear e,i, and set r to round to nearest
76	lfsr r0
77
78	; begin to compute sqrt(x)
79	movl f0,-8(fp)
80	movd -4(fp),r0 ; r0 the high part of modified x
81	lshd -1,r0 ; r0 >> 1
82	addd 0x1ff80000,r0 ; add correction to r0 ...got 5 bits approx.
83	movd r0,r6
84	lshd -13,r6 ; r6 = r0>>-15
85	andd 0x7c,r6 ; obtain 4*leading 5 bits of r0
86	addrd L29,r7 ; r7 = address of L29 = table[0]
87	addd r6,r7 ; r6 = address of L29[r6] = table[r6]
88	subd 0(r7),r0 ; r0 = r0 - table[r6]
89	movd r0,-4(fp)
90	movl -8(fp),f2 ; now f2 = y approximate sqrt(f0) to 8 bits
91
92	movl 0f0.5,f6 ; f6 = 0.5
93	movl f0,f4
94	divl f2,f4 ; t = x/y
95	addl f4,f2 ; y = y + x/y
96	mull f6,f2 ; y = 0.5(y+x/y) got 17 bits approx.
97	movl f0,f4
98	divl f2,f4 ; t = x/y
99	addl f4,f2 ; y = y + x/y
100	mull f6,f2 ; y = 0.5(y+x/y) got 35 bits approx.
101	movl f0,f4
102	divl f2,f4 ; t = x/y
103	subl f2,f4 ; t = x/y - y
104	mull f6,f4 ; t = 0.5(x/y-y)
105	addl f4,f2 ; y = y + 0.5(x/y -y)
106	; now y approx. sqrt(x) to within 1 ulp
107
108	; twiddle last bit to force y correctly rounded
109	movd r1,r0 ; restore the old fsr
110	bicd [6,7,8],r0 ; clear inexact bit but retain inexact enable
111	ord 0x80,r0 ; set rounding mode to round to zero
112	lfsr r0
113
114	movl f0,f4
115	divl f2,f4 ; f4 = x/y
116	sfsr r0
117	andd r3,r0 ; get the inexact flag
118	cmpqd 0,r0
119	bne L18
120	; if x/y exact, then ...
121	cmpl f2,f4 ; if y == x/y
122	beq L2
123	movl f4,-8(fp)
124	subd 1,-8(fp)
125	subcd 0,-4(fp)
126	movl -8(fp),f4 ; f4 = f4 - ulp
127	L18:
128	bicd [6],r1
129	ord r3,r1 ; set inexact flag in r1
130
131	andd r1,r4 ; r4 = the old rounding mode
132	cmpqd 0,r4 ; round to nearest?
133	bne L17
134	movl f4,-8(fp)
135	addd 1,-8(fp)
136	addcd 0,-4(fp)
137	movl -8(fp),f4 ; f4 = f4 + ulp
138	br L16
139	L17:
140	cmpd 0x100,r4 ; round to positive inf ?
141	bne L16
142	movl f4,-8(fp)
143	addd 1,-8(fp)
144	addcd 0,-4(fp)
145	movl -8(fp),f4 ; f4 = f4 + ulp
146
147	movl f2,-8(fp)
148	addd 1,-8(fp)
149	addcd 0,-4(fp)
150	movl -8(fp),f2 ; f2 = f2 + ulp
151	L16:
152	addl f4,f2 ; y = y + t
153	subd 0x100000,r5 ; scalx = scalx - 1
154	L2:
155	movl f2,-8(fp)
156	addd r5,-4(fp)
157	movl -8(fp),f0
158	lfsr r1
159	L1:
160	movl tos,f6
161	movl tos,f4
162	exit [r3,r4,r5,r6,r7]
163	ret 0
164	.data
165	L28: .byte 64,40,35,41,115,113,114,116,46,99
166	.byte 9,49,46,49,32,40,117,99,98,46
167	.byte 101,108,101,102,117,110,116,41,32,57
168	.byte 47,49,57,47,56,53,0
169	L29: .blkb 4
170	.double 1204
171	.double 3062
172	.double 5746
173	.double 9193
174	.double 13348
175	.double 18162
176	.double 23592
177	.double 29598
178	.double 36145
179	.double 43202
180	.double 50740
181	.double 58733
182	.double 67158
183	.double 75992
184	.double 85215
185	.double 83599
186	.double 71378
187	.double 60428
188	.double 50647
189	.double 41945
190	.double 34246
191	.double 27478
192	.double 21581
193	.double 16499
194	.double 12183
195	.double 8588
196	.double 5674
197	.double 3403
198	.double 1742
199	.double 661
200	.double 130
201	L30: .blkb 4
202	.double 1129316352 ;L30: .double 0,0x43500000
203	L31: .blkb 4
204	.double 0x1ff00000
205	L32: .blkb 4
206	.double 0x5ff00000