Coverage Report

Created: 2024-04-30 09:35

src/zserio/FloatUtil.cpp
Line
Count
Source
1
#include "zserio/FloatUtil.h"

#include <cstring>
2
3
namespace zserio
4
{
5
6
// File-local description of the half precision (float16) and single precision (float32) bit layouts.
namespace
{

// float16: masks selecting the sign, exponent and significand fields
const uint16_t FLOAT16_SIGN_MASK = UINT16_C(0x8000);
const uint16_t FLOAT16_EXPONENT_MASK = UINT16_C(0x7C00);
const uint16_t FLOAT16_SIGNIFICAND_MASK = UINT16_C(0x03FF);

// float16: position of the lowest bit of the sign and exponent fields
const uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15);
const uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10);

// float16: the significand occupies all bits below the exponent field
const uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;

// float16: exponent field value used for infinity / NaN and the exponent bias
const uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F);
const uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15);

// float32: masks selecting the sign, exponent and significand fields
const uint32_t FLOAT32_SIGN_MASK = UINT32_C(0x80000000);
const uint32_t FLOAT32_EXPONENT_MASK = UINT32_C(0x7F800000);
const uint32_t FLOAT32_SIGNIFICAND_MASK = UINT32_C(0x007FFFFF);

// float32: position of the lowest bit of the sign and exponent fields
const uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31);
const uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23);

// float32: the significand occupies all bits below the exponent field
const uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;

// float32: exponent field value used for infinity / NaN and the exponent bias
const uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF);
const uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127);

} // namespace
29
30
float convertUInt16ToFloat(uint16_t float16Value)
31
226
{
32
    // decompose half precision float (float16)
33
226
    const uint16_t sign16Shifted = (float16Value & FLOAT16_SIGN_MASK);
34
226
    const uint16_t exponent16 = static_cast<uint16_t>(
35
226
            static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION);
36
226
    const uint16_t significand16 = (float16Value & FLOAT16_SIGNIFICAND_MASK);
37
38
    // calculate significand for single precision float (float32)
39
226
    uint32_t significand32 = static_cast<uint32_t>(significand16)
40
226
            << (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS);
41
42
    // calculate exponent for single precision float (float32)
43
226
    uint32_t exponent32 = 0;
44
226
    if (exponent16 == 0)
45
46
    {
46
46
        if (significand32 != 0)
47
2
        {
48
            // subnormal (denormal) number will be normalized
49
2
            exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS; // exp is initialized by -14
50
            // shift significand until leading bit overflows into exponent bit
51
13
            while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0)
52
11
            {
53
11
                exponent32--;
54
11
                significand32 <<= 1U;
55
11
            }
56
            // mask out overflowed leading bit from significand (normalized has implicit leading bit 1)
57
2
            significand32 &= FLOAT32_SIGNIFICAND_MASK;
58
2
        }
59
46
    }
60
180
    else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN)
61
4
    {
62
        // infinity or NaN
63
4
        exponent32 = FLOAT32_EXPONENT_INFINITY_NAN;
64
4
    }
65
176
    else
66
176
    {
67
        // normal number
68
176
        exponent32 = exponent16 - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
69
176
    }
70
71
    // compose single precision float (float32)
72
226
    const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted)
73
226
            << (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION);
74
226
    const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION;
75
226
    const uint32_t float32Value = sign32Shifted | exponent32Shifted | significand32;
76
77
    // convert it to float
78
226
    return convertUInt32ToFloat(float32Value);
79
226
}
80
81
uint16_t convertFloatToUInt16(float float32)
82
321
{
83
321
    const uint32_t float32Value = convertFloatToUInt32(float32);
84
85
    // decompose single precision float (float32)
86
321
    const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK);
87
321
    const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION;
88
321
    const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK);
89
90
    // calculate significand for half precision float (float16)
91
321
    uint16_t significand16 = static_cast<uint16_t>(
92
321
            (significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS)));
93
94
    // calculate exponent for half precision float (float16)
95
321
    bool needsRounding = false;
96
321
    uint16_t exponent16 = 0;
97
321
    if (exponent32 == 0)
98
46
    {
99
46
        if (significand32 != 0)
100
2
        {
101
            // subnormal (denormal) number will be zero
102
2
            significand16 = 0;
103
2
        }
104
46
    }
105
275
    else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN)
106
4
    {
107
        // infinity or NaN
108
4
        exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
109
4
    }
110
271
    else
111
271
    {
112
        // normal number
113
271
        const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) -
114
271
                static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS));
115
271
        if (signedExponent16 > FLOAT16_EXPONENT_INFINITY_NAN)
116
1
        {
117
            // exponent overflow, set infinity or NaN
118
1
            exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
119
1
        }
120
270
        else if (signedExponent16 <= 0)
121
3
        {
122
            // exponent underflow
123
3
            if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS))
124
1
            {
125
                // too big underflow, set to zero
126
1
                significand16 = 0;
127
1
            }
128
2
            else
129
2
            {
130
                // we can still use subnormal numbers
131
2
                const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1);
132
2
                const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16);
133
2
                significand16 = static_cast<uint16_t>(fullSignificand32 >>
134
2
                        (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift));
135
136
2
                needsRounding =
137
2
                        ((fullSignificand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS +
138
2
                                                       significandShift - 1)) &
139
2
                                UINT32_C(1)) != 0;
140
2
            }
141
3
        }
142
267
        else
143
267
        {
144
            // exponent ok
145
267
            exponent16 = static_cast<uint16_t>(signedExponent16);
146
267
            needsRounding =
147
267
                    ((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS - 1)) &
148
267
                            UINT32_C(1)) != 0;
149
267
        }
150
271
    }
151
152
    // compose half precision float (float16)
153
321
    const uint16_t sign16Shifted =
154
321
            static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION));
155
321
    const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION);
156
321
    uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted) | significand16;
157
158
    // check rounding
159
321
    if (needsRounding)
160
2
        ++float16Value; // might overflow to infinity
161
162
321
    return float16Value;
163
321
}
164
165
// Reinterprets the given 32-bit pattern as a single precision float.
//
// The bytes are copied with std::memcpy because reading a uint32_t object through a float pointer
// violates the strict aliasing rule and is undefined behaviour.
float convertUInt32ToFloat(uint32_t float32Value)
{
    float convertedFloat = 0.0F;
    std::memcpy(&convertedFloat, &float32Value, sizeof(convertedFloat));

    return convertedFloat;
}
171
172
// Returns the 32-bit pattern which represents the given single precision float.
//
// The bytes are copied with std::memcpy because reading a float object through a uint32_t pointer
// violates the strict aliasing rule and is undefined behaviour.
uint32_t convertFloatToUInt32(float float32)
{
    uint32_t float32Value = 0;
    std::memcpy(&float32Value, &float32, sizeof(float32Value));

    return float32Value;
}
178
179
// Reinterprets the given 64-bit pattern as a double precision float.
//
// The bytes are copied with std::memcpy because reading a uint64_t object through a double pointer
// violates the strict aliasing rule and is undefined behaviour.
double convertUInt64ToDouble(uint64_t float64Value)
{
    double convertedDouble = 0.0;
    std::memcpy(&convertedDouble, &float64Value, sizeof(convertedDouble));

    return convertedDouble;
}
185
186
// Returns the 64-bit pattern which represents the given double precision float.
//
// The bytes are copied with std::memcpy because reading a double object through a uint64_t pointer
// violates the strict aliasing rule and is undefined behaviour.
uint64_t convertDoubleToUInt64(double float64)
{
    uint64_t float64Value = 0;
    std::memcpy(&float64Value, &float64, sizeof(float64Value));

    return float64Value;
}
192
193
} // namespace zserio