1 |
|
|
#include "zserio/FloatUtil.h"

#include <cstring>
2 |
|
|
|
3 |
|
|
namespace zserio |
4 |
|
|
{ |
5 |
|
|
|
6 |
|
|
static const uint16_t FLOAT16_SIGN_MASK = UINT16_C(0x8000); |
7 |
|
|
static const uint16_t FLOAT16_EXPONENT_MASK = UINT16_C(0x7C00); |
8 |
|
|
static const uint16_t FLOAT16_SIGNIFICAND_MASK = UINT16_C(0x03FF); |
9 |
|
|
|
10 |
|
|
static const uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15); |
11 |
|
|
static const uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10); |
12 |
|
|
|
13 |
|
|
static const uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION; |
14 |
|
|
|
15 |
|
|
static const uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F); |
16 |
|
|
static const uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15); |
17 |
|
|
|
18 |
|
|
static const uint32_t FLOAT32_SIGN_MASK = UINT32_C(0x80000000); |
19 |
|
|
static const uint32_t FLOAT32_EXPONENT_MASK = UINT32_C(0x7F800000); |
20 |
|
|
static const uint32_t FLOAT32_SIGNIFICAND_MASK = UINT32_C(0x007FFFFF); |
21 |
|
|
|
22 |
|
|
static const uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31); |
23 |
|
|
static const uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23); |
24 |
|
|
|
25 |
|
|
static const uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION; |
26 |
|
|
|
27 |
|
|
static const uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF); |
28 |
|
|
static const uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127); |
29 |
|
|
|
30 |
|
226 |
float convertUInt16ToFloat(uint16_t float16Value) |
31 |
|
|
{ |
32 |
|
|
// decompose half precision float (float16) |
33 |
|
226 |
const uint16_t sign16Shifted = (float16Value & FLOAT16_SIGN_MASK); |
34 |
|
226 |
const uint16_t exponent16 = static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> |
35 |
|
226 |
FLOAT16_EXPONENT_BIT_POSITION; |
36 |
|
226 |
const uint16_t significand16 = (float16Value & FLOAT16_SIGNIFICAND_MASK); |
37 |
|
|
|
38 |
|
|
// calculate significand for single precision float (float32) |
39 |
|
226 |
uint32_t significand32 = static_cast<uint32_t>(significand16) << |
40 |
|
226 |
(FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS); |
41 |
|
|
|
42 |
|
|
// calculate exponent for single precision float (float32) |
43 |
|
226 |
uint32_t exponent32 = 0; |
44 |
✓✓ |
226 |
if (exponent16 == 0) |
45 |
|
|
{ |
46 |
✓✓ |
46 |
if (significand32 != 0) |
47 |
|
|
{ |
48 |
|
|
// subnormal (denormal) number will be normalized |
49 |
|
2 |
exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS; // exp is initialized by -14 |
50 |
|
|
// shift significand until leading bit overflows into exponent bit |
51 |
✓✓ |
24 |
while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0) |
52 |
|
|
{ |
53 |
|
11 |
exponent32--; |
54 |
|
11 |
significand32 <<= 1U; |
55 |
|
|
} |
56 |
|
|
// mask out overflowed leading bit from significand (normalized has implicit leading bit 1) |
57 |
|
2 |
significand32 &= FLOAT32_SIGNIFICAND_MASK; |
58 |
|
|
} |
59 |
|
|
} |
60 |
✓✓ |
180 |
else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN) |
61 |
|
|
{ |
62 |
|
|
// infinity or NaN |
63 |
|
4 |
exponent32 = FLOAT32_EXPONENT_INFINITY_NAN; |
64 |
|
|
} |
65 |
|
|
else |
66 |
|
|
{ |
67 |
|
|
// normal number |
68 |
|
176 |
exponent32 = exponent16 - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS; |
69 |
|
|
} |
70 |
|
|
|
71 |
|
|
// compose single precision float (float32) |
72 |
|
226 |
const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted) << (FLOAT32_SIGN_BIT_POSITION - |
73 |
|
226 |
FLOAT16_SIGN_BIT_POSITION); |
74 |
|
226 |
const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION; |
75 |
|
226 |
const uint32_t float32Value = sign32Shifted | exponent32Shifted | significand32; |
76 |
|
|
|
77 |
|
|
// convert it to float |
78 |
|
226 |
return convertUInt32ToFloat(float32Value); |
79 |
|
|
} |
80 |
|
|
|
81 |
|
321 |
uint16_t convertFloatToUInt16(float float32) |
82 |
|
|
{ |
83 |
|
321 |
const uint32_t float32Value = convertFloatToUInt32(float32); |
84 |
|
|
|
85 |
|
|
// decompose single precision float (float32) |
86 |
|
321 |
const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK); |
87 |
|
321 |
const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION; |
88 |
|
321 |
const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK); |
89 |
|
|
|
90 |
|
|
// calculate significand for half precision float (float16) |
91 |
|
321 |
uint16_t significand16 = static_cast<uint16_t>((significand32 >> |
92 |
|
321 |
(FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS))); |
93 |
|
|
|
94 |
|
|
// calculate exponent for half precision float (float16) |
95 |
|
321 |
bool needsRounding = false; |
96 |
|
321 |
uint16_t exponent16 = 0; |
97 |
✓✓ |
321 |
if (exponent32 == 0) |
98 |
|
|
{ |
99 |
✓✓ |
46 |
if (significand32 != 0) |
100 |
|
|
{ |
101 |
|
|
// subnormal (denormal) number will be zero |
102 |
|
2 |
significand16 = 0; |
103 |
|
|
} |
104 |
|
|
} |
105 |
✓✓ |
275 |
else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN) |
106 |
|
|
{ |
107 |
|
|
// infinity or NaN |
108 |
|
4 |
exponent16 = FLOAT16_EXPONENT_INFINITY_NAN; |
109 |
|
|
} |
110 |
|
|
else |
111 |
|
|
{ |
112 |
|
|
// normal number |
113 |
|
|
const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) - |
114 |
|
271 |
static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS)); |
115 |
✓✓ |
271 |
if (signedExponent16 > FLOAT16_EXPONENT_INFINITY_NAN) |
116 |
|
|
{ |
117 |
|
|
// exponent overflow, set infinity or NaN |
118 |
|
1 |
exponent16 = FLOAT16_EXPONENT_INFINITY_NAN; |
119 |
|
|
} |
120 |
✓✓ |
270 |
else if (signedExponent16 <= 0) |
121 |
|
|
{ |
122 |
|
|
// exponent underflow |
123 |
✓✓ |
3 |
if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS)) |
124 |
|
|
{ |
125 |
|
|
// too big underflow, set to zero |
126 |
|
1 |
significand16 = 0; |
127 |
|
|
} |
128 |
|
|
else |
129 |
|
|
{ |
130 |
|
|
// we can still use subnormal numbers |
131 |
|
2 |
const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1); |
132 |
|
2 |
const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16); |
133 |
|
2 |
significand16 = static_cast<uint16_t>(fullSignificand32 >> |
134 |
|
4 |
(FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift)); |
135 |
|
|
|
136 |
|
2 |
needsRounding = ((fullSignificand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - |
137 |
|
4 |
FLOAT16_SIGNIFICAND_NUM_BITS + significandShift - 1)) & UINT32_C(1)) != 0; |
138 |
|
|
} |
139 |
|
|
} |
140 |
|
|
else |
141 |
|
|
{ |
142 |
|
|
// exponent ok |
143 |
|
267 |
exponent16 = static_cast<uint16_t>(signedExponent16); |
144 |
|
|
needsRounding = ((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - |
145 |
|
267 |
FLOAT16_SIGNIFICAND_NUM_BITS - 1)) & UINT32_C(1)) != 0; |
146 |
|
|
} |
147 |
|
|
} |
148 |
|
|
|
149 |
|
|
// compose half precision float (float16) |
150 |
|
321 |
const uint16_t sign16Shifted = static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - |
151 |
|
321 |
FLOAT16_SIGN_BIT_POSITION)); |
152 |
|
321 |
const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION); |
153 |
|
321 |
uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted) | significand16; |
154 |
|
|
|
155 |
|
|
// check rounding |
156 |
✓✓ |
321 |
if (needsRounding) |
157 |
|
2 |
float16Value += UINT16_C(1); // might overflow to infinity |
158 |
|
|
|
159 |
|
321 |
return float16Value; |
160 |
|
|
} |
161 |
|
|
|
162 |
|
559 |
float convertUInt32ToFloat(uint32_t float32Value) |
163 |
|
|
{ |
164 |
|
559 |
const float* convertedFloat = reinterpret_cast<const float*>(static_cast<void*>(&float32Value)); |
165 |
|
|
|
166 |
|
559 |
return *convertedFloat; |
167 |
|
|
} |
168 |
|
|
|
169 |
|
837 |
uint32_t convertFloatToUInt32(float float32) |
170 |
|
|
{ |
171 |
|
837 |
const uint32_t* float32ValuePtr = reinterpret_cast<const uint32_t*>(static_cast<void*>(&float32)); |
172 |
|
|
|
173 |
|
837 |
return *float32ValuePtr; |
174 |
|
|
} |
175 |
|
|
|
176 |
|
501 |
double convertUInt64ToDouble(uint64_t float64Value) |
177 |
|
|
{ |
178 |
|
501 |
const double* convertedDouble = reinterpret_cast<const double*>(static_cast<void*>(&float64Value)); |
179 |
|
|
|
180 |
|
501 |
return *convertedDouble; |
181 |
|
|
} |
182 |
|
|
|
183 |
|
869 |
uint64_t convertDoubleToUInt64(double float64) |
184 |
|
|
{ |
185 |
|
869 |
const uint64_t* float64ValuePtr = reinterpret_cast<const uint64_t*>(static_cast<void*>(&float64)); |
186 |
|
|
|
187 |
|
869 |
return *float64ValuePtr; |
188 |
|
|
} |
189 |
|
|
|
190 |
|
|
} // namespace zserio |