Line | Count | Source |
#include "zserio/FloatUtil.h"

#include <cstring>
2 | | |
namespace zserio
{

// Bit layout of half precision float (float16): 1 sign bit, 5 exponent bits, 10 significand bits.
static const uint16_t FLOAT16_SIGN_MASK = UINT16_C(0x8000);
static const uint16_t FLOAT16_EXPONENT_MASK = UINT16_C(0x7C00);
static const uint16_t FLOAT16_SIGNIFICAND_MASK = UINT16_C(0x03FF);

static const uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15);
static const uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10);

// The significand occupies all bits below the exponent.
static const uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;

// Exponent field value (all ones) which marks infinity or NaN.
static const uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F);
static const uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15);

// Bit layout of single precision float (float32): 1 sign bit, 8 exponent bits, 23 significand bits.
static const uint32_t FLOAT32_SIGN_MASK = UINT32_C(0x80000000);
static const uint32_t FLOAT32_EXPONENT_MASK = UINT32_C(0x7F800000);
static const uint32_t FLOAT32_SIGNIFICAND_MASK = UINT32_C(0x007FFFFF);

static const uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31);
static const uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23);

// The significand occupies all bits below the exponent.
static const uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;

// Exponent field value (all ones) which marks infinity or NaN.
static const uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF);
static const uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127);

// Reinterprets the given 32-bit pattern as a float without changing any bit.
//
// The bytes are copied instead of being read through a casted pointer, because accessing an uint32_t
// object through a float pointer violates the strict aliasing rule (undefined behavior).
// Assumes that float is 32 bits wide.
float convertUInt32ToFloat(uint32_t float32Value)
{
    float convertedFloat = 0.0F;
    (void)std::memcpy(&convertedFloat, &float32Value, sizeof(float32Value));

    return convertedFloat;
}

// Returns the bit pattern of the given float as a 32-bit unsigned integer.
//
// Uses a byte copy for the same strict aliasing reason as convertUInt32ToFloat.
// Assumes that float is 32 bits wide.
uint32_t convertFloatToUInt32(float float32)
{
    uint32_t float32Value = 0;
    (void)std::memcpy(&float32Value, &float32, sizeof(float32Value));

    return float32Value;
}

// Reinterprets the given 64-bit pattern as a double without changing any bit.
//
// Uses a byte copy to stay within the strict aliasing rule. Assumes that double is 64 bits wide.
double convertUInt64ToDouble(uint64_t float64Value)
{
    double convertedDouble = 0.0;
    (void)std::memcpy(&convertedDouble, &float64Value, sizeof(float64Value));

    return convertedDouble;
}

// Returns the bit pattern of the given double as a 64-bit unsigned integer.
//
// Uses a byte copy to stay within the strict aliasing rule. Assumes that double is 64 bits wide.
uint64_t convertDoubleToUInt64(double float64)
{
    uint64_t float64Value = 0;
    (void)std::memcpy(&float64Value, &float64, sizeof(float64Value));

    return float64Value;
}

// Converts a half precision float given as its 16-bit pattern to a single precision float.
//
// Every float16 value is exactly representable in float32: subnormal float16 numbers are normalized,
// infinity stays infinity and NaN stays NaN (the significand bits are moved to the top of the float32
// significand).
float convertUInt16ToFloat(uint16_t float16Value)
{
    // decompose half precision float (float16)
    const uint16_t sign16Shifted = (float16Value & FLOAT16_SIGN_MASK);
    const uint16_t exponent16 = static_cast<uint16_t>(
            static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION);
    const uint16_t significand16 = (float16Value & FLOAT16_SIGNIFICAND_MASK);

    // calculate significand for single precision float (float32)
    uint32_t significand32 = static_cast<uint32_t>(significand16)
            << (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS);

    // calculate exponent for single precision float (float32)
    uint32_t exponent32 = 0;
    if (exponent16 == 0)
    {
        if (significand32 != 0)
        {
            // subnormal (denormal) number will be normalized
            exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS; // exp is initialized by -14
            // shift significand until leading bit overflows into exponent bit
            while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0)
            {
                exponent32--;
                significand32 <<= 1U;
            }
            // mask out overflowed leading bit from significand (normalized has implicit leading bit 1)
            significand32 &= FLOAT32_SIGNIFICAND_MASK;
        }
        // else: signed zero, exponent and significand stay zero
    }
    else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN)
    {
        // infinity or NaN
        exponent32 = FLOAT32_EXPONENT_INFINITY_NAN;
    }
    else
    {
        // normal number, only the bias changes
        exponent32 = static_cast<uint32_t>(exponent16) - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
    }

    // compose single precision float (float32)
    const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted)
            << (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION);
    const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION;
    const uint32_t float32Value = sign32Shifted | exponent32Shifted | significand32;

    // convert it to float
    return convertUInt32ToFloat(float32Value);
}

// Converts a single precision float to a half precision float returned as its 16-bit pattern.
//
// - Zero and float32 subnormal numbers are converted to (signed) zero.
// - Infinity stays infinity and NaN stays NaN.
// - Finite numbers which are too big for float16 are converted to (signed) infinity.
// - Numbers which are too small for a normal float16 are converted to a subnormal float16 or to zero.
// - The dropped significand bits are rounded half up (the highest dropped bit decides).
uint16_t convertFloatToUInt16(float float32)
{
    const uint32_t float32Value = convertFloatToUInt32(float32);

    // decompose single precision float (float32)
    const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK);
    const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION;
    const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK);

    // number of low significand bits which do not fit into float16
    const uint32_t droppedBits = FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS;

    // calculate significand for half precision float (float16)
    uint16_t significand16 = static_cast<uint16_t>(significand32 >> droppedBits);

    // calculate exponent for half precision float (float16)
    bool needsRounding = false;
    uint16_t exponent16 = 0;
    if (exponent32 == 0)
    {
        // zero or subnormal (denormal) number, both will be zero
        significand16 = 0;
    }
    else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN)
    {
        // infinity or NaN
        exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
        if (significand32 != 0 && significand16 == 0)
        {
            // NaN payload was only in the dropped bits, zero significand would turn NaN into infinity
            significand16 = 1;
        }
    }
    else
    {
        // normal number
        const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) -
                static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS));
        if (signedExponent16 >= FLOAT16_EXPONENT_INFINITY_NAN)
        {
            // exponent overflow, set infinity (significand must be cleared, otherwise it would be NaN)
            exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
            significand16 = 0;
        }
        else if (signedExponent16 <= 0)
        {
            // exponent underflow
            if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS))
            {
                // too big underflow, set to zero
                significand16 = 0;
            }
            else
            {
                // we can still use subnormal numbers, leading bit 1 must be made explicit
                const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1);
                const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16);
                const uint32_t totalShift = droppedBits + significandShift;
                significand16 = static_cast<uint16_t>(fullSignificand32 >> totalShift);

                // the highest dropped bit decides about rounding
                needsRounding = ((fullSignificand32 >> (totalShift - 1)) & UINT32_C(1)) != 0;
            }
        }
        else
        {
            // exponent ok
            exponent16 = static_cast<uint16_t>(signedExponent16);
            // the highest dropped bit decides about rounding
            needsRounding = ((significand32 >> (droppedBits - 1)) & UINT32_C(1)) != 0;
        }
    }

    // compose half precision float (float16)
    const uint16_t sign16Shifted =
            static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION));
    const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION);
    uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted | significand16);

    // check rounding
    if (needsRounding)
    {
        ++float16Value; // carry may go to the exponent, the biggest exponent overflows to infinity
    }

    return float16Value;
}

} // namespace zserio