1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
/* 128-bit floating point to 32-bit floating point.
Copyright (C) 1997 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
#include <quad_float.h>
/* float _q_qtos(const long double *a);
Convert 'a' to float. Round as per current rounding flags.
Input Rounding Output
+/-0 * +/-0
+/-Inf * +/-Inf
+/-NaN * +/-NaN (with mantissa truncated)
+/-SNaN * +/-NaN (with mantissa truncated, MSB of mantissa <- 1)
&& raise VXSNAN
[Note: just truncating the mantissa may not give you
a SNaN!]
|a|>=2^128 Nearest +/-Inf && raise overflow && raise inexact
|a|>=2^128 Truncate +/-(2^128-2^104) && raise overflow && raise inexact
a>=2^128 +Inf +Inf && raise overflow && raise inexact
a<=-2^128 +Inf -(2^128-2^104) && raise overflow && raise inexact
a>=2^128 -Inf +(2^128-2^104) && raise overflow && raise inexact
a<=-2^128 -Inf -Inf && raise overflow && raise inexact
We also need to raise 'inexact' if the result will be inexact, which
depends on the current rounding mode.
To avoid having to deal with all that, we convert to a 'double'
that will round correctly (but is not itself rounded correctly),
and convert that to a float. This makes this procedure much
simpler and much faster. */
float
__q_qtos(const unsigned long long a[2])
{
unsigned long long a0,d;
union {
double d;
unsigned long long ull;
} u;
a0 = a[0];
/* Truncate the mantissa to 48 bits. */
d = a0 << 4;
/* Set the low bit in the mantissa if any of the bits we are dropping
were 1. This ensures correct rounding, and also distinguishes
0 and Inf from denormalised numbers and SNaN (respectively). */
d |= a[1] != 0;
/* Copy the sign bit. */
d = d & 0x7fffffffffffffffULL | a0 & 0x8000000000000000ULL;
/* Now, we need to fix the exponent. If the exponent of a was in
the range +127 to -152, or was +16384 or -16383, it is already
correct in 'd'. Otherwise, we need to ensure that the new
exponent is in the range +1023 to +128, or -153 to -1022, with
the same sign as the exponent of 'a'. We can do this by setting
bits 1-3 (the second through fourth-most significant bit) of 'd'
to 101 if bit 1 of 'a' is 1, or 010 if bit 1 of 'a' is 0. */
if ((a0 >> 56 & 0x7f) - 0x3f > 1)
{
unsigned t = (a0 >> 32+2 & 2 << 31-1-2)*3 + (2 << 31-2);
d = (d & 0x8fffffffffffffffULL
| (unsigned long long)t<<32 & 0x7000000000000000ULL);
}
u.ull = d;
return (float)u.d;
}
|