Line | Count | Source |
#include "zserio/FloatUtil.h"

#include <cstring>
2 | | |
3 | | namespace zserio |
4 | | { |
5 | | |
// File-local description of the bit layouts used by the conversions below.
// The unnamed namespace gives these constants internal linkage.
namespace
{

// half precision (float16): 1 sign bit, 5 exponent bits, 10 significand bits
const uint16_t FLOAT16_SIGN_MASK = UINT16_C(0x8000);
const uint16_t FLOAT16_EXPONENT_MASK = UINT16_C(0x7C00);
const uint16_t FLOAT16_SIGNIFICAND_MASK = UINT16_C(0x03FF);

// positions of the lowest bit of the sign and exponent fields
const uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15);
const uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10);

// the significand occupies every bit below the exponent field
const uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;

// all-ones exponent field (marks infinity / NaN) and the exponent bias
const uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F);
const uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15);

// single precision (float32): 1 sign bit, 8 exponent bits, 23 significand bits
const uint32_t FLOAT32_SIGN_MASK = UINT32_C(0x80000000);
const uint32_t FLOAT32_EXPONENT_MASK = UINT32_C(0x7F800000);
const uint32_t FLOAT32_SIGNIFICAND_MASK = UINT32_C(0x007FFFFF);

// positions of the lowest bit of the sign and exponent fields
const uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31);
const uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23);

// the significand occupies every bit below the exponent field
const uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;

// all-ones exponent field (marks infinity / NaN) and the exponent bias
const uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF);
const uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127);

} // namespace
29 | | |
30 | | float convertUInt16ToFloat(uint16_t float16Value) |
31 | 226 | { |
32 | | // decompose half precision float (float16) |
33 | 226 | const uint16_t sign16Shifted = (float16Value & FLOAT16_SIGN_MASK); |
34 | 226 | const uint16_t exponent16 = static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> |
35 | 226 | FLOAT16_EXPONENT_BIT_POSITION; |
36 | 226 | const uint16_t significand16 = (float16Value & FLOAT16_SIGNIFICAND_MASK); |
37 | | |
38 | | // calculate significand for single precision float (float32) |
39 | 226 | uint32_t significand32 = static_cast<uint32_t>(significand16) << |
40 | 226 | (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS); |
41 | | |
42 | | // calculate exponent for single precision float (float32) |
43 | 226 | uint32_t exponent32 = 0; |
44 | 226 | if (exponent16 == 0) |
45 | 46 | { |
46 | 46 | if (significand32 != 0) |
47 | 2 | { |
48 | | // subnormal (denormal) number will be normalized |
49 | 2 | exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS; // exp is initialized by -14 |
50 | | // shift significand until leading bit overflows into exponent bit |
51 | 13 | while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0) |
52 | 11 | { |
53 | 11 | exponent32--; |
54 | 11 | significand32 <<= 1U; |
55 | 11 | } |
56 | | // mask out overflowed leading bit from significand (normalized has implicit leading bit 1) |
57 | 2 | significand32 &= FLOAT32_SIGNIFICAND_MASK; |
58 | 2 | } |
59 | 46 | } |
60 | 180 | else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN) |
61 | 4 | { |
62 | | // infinity or NaN |
63 | 4 | exponent32 = FLOAT32_EXPONENT_INFINITY_NAN; |
64 | 4 | } |
65 | 176 | else |
66 | 176 | { |
67 | | // normal number |
68 | 176 | exponent32 = exponent16 - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS; |
69 | 176 | } |
70 | | |
71 | | // compose single precision float (float32) |
72 | 226 | const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted) << (FLOAT32_SIGN_BIT_POSITION - |
73 | 226 | FLOAT16_SIGN_BIT_POSITION); |
74 | 226 | const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION; |
75 | 226 | const uint32_t float32Value = sign32Shifted | exponent32Shifted | significand32; |
76 | | |
77 | | // convert it to float |
78 | 226 | return convertUInt32ToFloat(float32Value); |
79 | 226 | } |
80 | | |
81 | | uint16_t convertFloatToUInt16(float float32) |
82 | 321 | { |
83 | 321 | const uint32_t float32Value = convertFloatToUInt32(float32); |
84 | | |
85 | | // decompose single precision float (float32) |
86 | 321 | const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK); |
87 | 321 | const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION; |
88 | 321 | const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK); |
89 | | |
90 | | // calculate significand for half precision float (float16) |
91 | 321 | uint16_t significand16 = static_cast<uint16_t>((significand32 >> |
92 | 321 | (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS))); |
93 | | |
94 | | // calculate exponent for half precision float (float16) |
95 | 321 | bool needsRounding = false; |
96 | 321 | uint16_t exponent16 = 0; |
97 | 321 | if (exponent32 == 0) |
98 | 46 | { |
99 | 46 | if (significand32 != 0) |
100 | 2 | { |
101 | | // subnormal (denormal) number will be zero |
102 | 2 | significand16 = 0; |
103 | 2 | } |
104 | 46 | } |
105 | 275 | else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN) |
106 | 4 | { |
107 | | // infinity or NaN |
108 | 4 | exponent16 = FLOAT16_EXPONENT_INFINITY_NAN; |
109 | 4 | } |
110 | 271 | else |
111 | 271 | { |
112 | | // normal number |
113 | 271 | const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) - |
114 | 271 | static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS)); |
115 | 271 | if (signedExponent16 > FLOAT16_EXPONENT_INFINITY_NAN) |
116 | 1 | { |
117 | | // exponent overflow, set infinity or NaN |
118 | 1 | exponent16 = FLOAT16_EXPONENT_INFINITY_NAN; |
119 | 1 | } |
120 | 270 | else if (signedExponent16 <= 0) |
121 | 3 | { |
122 | | // exponent underflow |
123 | 3 | if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS)) |
124 | 1 | { |
125 | | // too big underflow, set to zero |
126 | 1 | significand16 = 0; |
127 | 1 | } |
128 | 2 | else |
129 | 2 | { |
130 | | // we can still use subnormal numbers |
131 | 2 | const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1); |
132 | 2 | const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16); |
133 | 2 | significand16 = static_cast<uint16_t>(fullSignificand32 >> |
134 | 2 | (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift)); |
135 | | |
136 | 2 | needsRounding = ((fullSignificand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - |
137 | 2 | FLOAT16_SIGNIFICAND_NUM_BITS + significandShift - 1)) & UINT32_C(1)) != 0; |
138 | 2 | } |
139 | 3 | } |
140 | 267 | else |
141 | 267 | { |
142 | | // exponent ok |
143 | 267 | exponent16 = static_cast<uint16_t>(signedExponent16); |
144 | 267 | needsRounding = ((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - |
145 | 267 | FLOAT16_SIGNIFICAND_NUM_BITS - 1)) & UINT32_C(1)) != 0; |
146 | 267 | } |
147 | 271 | } |
148 | | |
149 | | // compose half precision float (float16) |
150 | 321 | const uint16_t sign16Shifted = static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - |
151 | 321 | FLOAT16_SIGN_BIT_POSITION)); |
152 | 321 | const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION); |
153 | 321 | uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted) | significand16; |
154 | | |
155 | | // check rounding |
156 | 321 | if (needsRounding) |
157 | 2 | float16Value += UINT16_C(1); // might overflow to infinity |
158 | | |
159 | 321 | return float16Value; |
160 | 321 | } |
161 | | |
/**
 * Reinterprets the bits of a 32-bit unsigned integer as a single precision float.
 *
 * The bytes are copied with std::memcpy because reading the integer through a float pointer
 * (reinterpret_cast) violates the strict aliasing rule and is undefined behaviour.
 *
 * \param float32Value Single precision float stored as raw bits in a 32-bit unsigned integer.
 *
 * \return Float whose object representation equals the given bits.
 */
float convertUInt32ToFloat(uint32_t float32Value)
{
    float convertedFloat = 0.0F;
    // as in the original code, float is assumed to be 32 bits wide
    (void)std::memcpy(&convertedFloat, &float32Value, sizeof(convertedFloat));

    return convertedFloat;
}
168 | | |
/**
 * Returns the raw bits of a single precision float as a 32-bit unsigned integer.
 *
 * The bytes are copied with std::memcpy because reading the float through an integer pointer
 * (reinterpret_cast) violates the strict aliasing rule and is undefined behaviour.
 *
 * \param float32 Single precision float to convert.
 *
 * \return Object representation of the given float stored in a 32-bit unsigned integer.
 */
uint32_t convertFloatToUInt32(float float32)
{
    uint32_t float32Value = 0;
    // as in the original code, float is assumed to be 32 bits wide
    (void)std::memcpy(&float32Value, &float32, sizeof(float32Value));

    return float32Value;
}
175 | | |
/**
 * Reinterprets the bits of a 64-bit unsigned integer as a double precision float.
 *
 * The bytes are copied with std::memcpy because reading the integer through a double pointer
 * (reinterpret_cast) violates the strict aliasing rule and is undefined behaviour.
 *
 * \param float64Value Double precision float stored as raw bits in a 64-bit unsigned integer.
 *
 * \return Double whose object representation equals the given bits.
 */
double convertUInt64ToDouble(uint64_t float64Value)
{
    double convertedDouble = 0.0;
    // as in the original code, double is assumed to be 64 bits wide
    (void)std::memcpy(&convertedDouble, &float64Value, sizeof(convertedDouble));

    return convertedDouble;
}
182 | | |
/**
 * Returns the raw bits of a double precision float as a 64-bit unsigned integer.
 *
 * The bytes are copied with std::memcpy because reading the double through an integer pointer
 * (reinterpret_cast) violates the strict aliasing rule and is undefined behaviour.
 *
 * \param float64 Double precision float to convert.
 *
 * \return Object representation of the given double stored in a 64-bit unsigned integer.
 */
uint64_t convertDoubleToUInt64(double float64)
{
    uint64_t float64Value = 0;
    // as in the original code, double is assumed to be 64 bits wide
    (void)std::memcpy(&float64Value, &float64, sizeof(float64Value));

    return float64Value;
}
189 | | |
190 | | } // namespace zserio |