-
Notifications
You must be signed in to change notification settings - Fork 16
/
protocolfloatspecial.cpp
568 lines (453 loc) · 17.9 KB
/
protocolfloatspecial.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
#include "protocolfloatspecial.h"
/*!
 * Construct the special float generator.
 * \param protocolsupport is the protocol support information; the same value
 *        is used to initialize the header member, the source member, and the
 *        support member.
 */
ProtocolFloatSpecial::ProtocolFloatSpecial(ProtocolSupport protocolsupport)
    : header(protocolsupport)
    , source(protocolsupport)
    , support(protocolsupport)
{
}
/*!
 * Perform the generation, writing out the header and then the source file.
 * \param fileNameList receives the file name of each file that was generated.
 * \param filePathList receives the file path of each file that was generated.
 * \return true only if special float support is enabled and both the header
 *         and the source were generated. If the header succeeds but the
 *         source fails the header entries have still been appended to the
 *         lists and false is returned.
 */
bool ProtocolFloatSpecial::generate(std::vector<std::string>& fileNameList, std::vector<std::string>& filePathList)
{
    // Bail out early if special floats are not wanted. The short-circuit
    // means generateHeader() is not even attempted in that case.
    if(!support.specialFloat || !generateHeader())
        return false;

    // The header exists now, so it is reported regardless of what happens next
    fileNameList.push_back(header.fileName());
    filePathList.push_back(header.filePath());

    if(!generateSource())
        return false;

    fileNameList.push_back(source.fileName());
    filePathList.push_back(source.filePath());

    return true;
}
/*!
 * Generate the header file of the "floatspecial" module. The header gets a
 * file comment describing the float16 and float24 formats, an include of
 * stdint.h, and the declarations of the validity checks, the conversion
 * routines, and the self test.
 *
 * The text inside the raw string literals is handed to the header object
 * exactly as written here, so its line breaks and leading characters are
 * part of the generated output.
 * \return the result of flushing the header file object.
 */
bool ProtocolFloatSpecial::generateHeader(void)
{
    header.setModuleNameAndPath("floatspecial", support.outputpath);

    // Raw string magic here
    header.setFileComment(R"(\brief Special routines for floating point manipulation
These routines allow floating point values to be compressed to
smaller formats by discarding resolution and dynamic range. This is
useful for saving space in data messages for fields that have a lot of
dynamic range, but not a lot of required resolution.
float16 and float24 are not defined by IEEE-754, but uses the same rules.
The most significant bit is a sign bit, the next bits are biased exponent
bits, and the remaining bits are used for the significand. float16 and
float24 have a variable number of signifcand bits, which can be adjusted
as needed to best fit the application.
Note that IEEE-754 defines a binary16 format (also called half-precision),
which uses a 10-bit significand (therefore 5 bits of exponent). float16
with 10 bits significand is the same as IEEE-754 half precision.
float16 and float24 cannot be used for arithmetic. Accordingly this module
only provides routines to convert between these and binary32 (float). In
memory floating point numbers are always IEEE-754 binary32 or IEEE-754
binary64. The in-memory representation of a float16 or float24 is actually
an integer which can be encoded into a data message like any integer)");

    header.makeLineSeparator();

    // The generated declarations use the fixed width integer types.
    // NOTE(review): the trailing `true` presumably selects a system style
    // include - confirm against the file object's writeIncludeDirective().
    header.writeIncludeDirective("stdint.h", std::string(), true);

    header.makeLineSeparator();

    // Raw string magic here. float24 is described simply as a 24-bit format,
    // because the file comment above states it is not an IEEE-754 format.
    header.write(R"(//! Determine if a 32-bit field represents a valid 32-bit IEEE-754 floating point number.
int isFloat32Valid(uint32_t value);
//! Determine if a 64-bit field represents a valid 64-bit IEEE-754 floating point number.
int isFloat64Valid(uint64_t value);
//! Convert a 32-bit floating point value to 24-bit floating point
uint32_t float32ToFloat24(float value, int sigbits);
//! Convert a 24-bit floating point representation to binary32
float float24ToFloat32(uint32_t value, int sigbits);
//! Convert a 32-bit floating point value to 16-bit floating point representation
uint16_t float32ToFloat16(float value, int sigbits);
//! Convert a 16 bit floating point representation to binary32
float float16ToFloat32(uint16_t value, int sigbits);
//! test the special float functionality
int testSpecialFloat(void);)");

    header.makeLineSeparator();

    return header.flush();

}// ProtocolFloatSpecial::generateHeader
/*!
 * Generate the source file of the "floatspecial" module. The source gets an
 * include of math.h followed by the C implementations of isFloat32Valid(),
 * isFloat64Valid(), float32ToFloat24(), float24ToFloat32(),
 * float32ToFloat16(), float16ToFloat32(), and testSpecialFloat().
 *
 * The text inside the raw string literal is handed to the source object
 * exactly as written here, so its line breaks and leading characters are
 * part of the generated output.
 * \return the result of flushing the source file object.
 */
bool ProtocolFloatSpecial::generateSource(void)
{
    source.setModuleNameAndPath("floatspecial", support.outputpath);

    // math.h is needed by the generated self test, which calls fabs().
    // NOTE(review): the generated code also uses the stdint types; presumably
    // they arrive through the module's own header - confirm.
    source.writeIncludeDirective("math.h", "", true);

    source.makeLineSeparator();

    // Raw string magic here
    source.write(R"===(/*!
* Determine if a 32-bit field represents a valid 32-bit IEEE-754 floating
* point number. If the field is infinity, NaN, or de-normalized then it is
* not valid. This determination is made without using any floating point
* instructions.
* \param value is the field to evaluate
* \return 0 if field is infinity, NaN, or de-normalized, else 1
*/
int isFloat32Valid(uint32_t value)
{
// Five cases for floating point numbers:
// 0) The exponent is greater than zero and less than the maximum. This is a normal non-zero number.
// 1) The exponent and the significand are zero. This is zero.
// 2) The exponent is zero, and the significant is non-zero. This is denormalized.
// 3) The exponent is the maximum value, and the significand is zero. This is infinity.
// 4) The exponent is the maximum value, and the significand is non-zero. This is NaN.
// We check for cases 2, 3, 4 and return 0 if it happens
if((value & 0x7F800000ul) == (0x7F800000ul))
{
// inifinity or NaN.
return 0;
}// if the exponent is the maximum value
else if((value & 0x7F800000ul) == 0)
{
// Check for denormalized number
if(value & 0x007FFFFFul)
return 0;
}// else if the exponent is zero
// If we get here then its a valid float
return 1;
}// isFloat32Valid
/*!
* Determine if a 64-bit field represents a valid 64-bit IEEE-754 floating
* point number. If the field is infinity, NaN, or de-normalized then it is
* not valid. This determination is made without using any floating point
* instructions.
* \param value is the field to evaluate
* \return 0 if field is infinity, NaN, or de-normalized, else 1
*/
int isFloat64Valid(uint64_t value)
{
// Five cases for floating point numbers:
// 0) The exponent is greater than zero and less than the maximum. This is a normal non-zero number.
// 1) The exponent and the significand are zero. This is zero.
// 2) The exponent is zero, and the significant is non-zero. This is denormalized.
// 3) The exponent is the maximum value, and the significand is zero. This is infinity.
// 4) The exponent is the maximum value, and the significand is non-zero. This is NaN.
// We check for cases 2, 3, 4 and return 0 if it happens
if((value & 0x7FF0000000000000ULL) == (0x7FF0000000000000ULL))
{
// inifinity or NaN.
return 0;
}// if the exponent is the maximum value
else if((value & 0x7FF0000000000000ULL) == 0)
{
// Check for denormalized number
if(value & 0x000FFFFFFFFFFFFFULL)
return 0;
}// else if the exponent is zero
// If we get here then its a valid float
return 1;
}// isFloat64Valid
/*!
* Convert a 32-bit floating point value (IEEE-754 binary32) to 24-bit floating
* point representation with a variable number of bits for the significand.
* Underflow will be returned as zero and overflow as the maximum possible value.
* \param value is the 32-bit floating point data to convert.
* \param sigbits is the number of bits to use for the significand, and must be
* between 4 and 20 bits inclusive.
* \return The float24 as a simple 24-bit integer (most significant byte clear).
*/
uint32_t float32ToFloat24(float value, int sigbits)
{
union
{
float Float;
uint32_t Integer;
}field;
uint32_t significand;
uint32_t unsignedExponent;
int32_t signedExponent;
uint32_t output;
uint32_t maxsignificand = (1 << sigbits) - 1;
// The bias is computed as 2 raised to the number of exponent bits divided
// by two, minus 1. This can be simplified as 2^(exponent bits -1) - 1
// The number of exponent bits is 24 - 1 - sigbits
int bias = (1 << (22 - sigbits)) - 1;
// Write the floating point value to our union so we can access its bits.
// Note that C99 and C++2011 have built in goodness for this sort of
// thing, but not all compilers support that (sigh...)
field.Float = value;
// The significand is the least significant 23 bits (IEEE754)
significand = field.Integer & 0x007FFFFFul;
// Exponent occupies the next 8 bits (IEEE754)
unsignedExponent = (field.Integer & 0x7F800000ul) >> 23;
// Get rid of some bits; here is where we sacrifice resolution, so we need
// to round correctly, for that reason we do not shift the last bit yet
output = significand >> (22-sigbits);
// Check for rounding, if the most significant bit that we are going to
// throw away is a 1 we need to round up.
if(output & 0x01)
{
output = (output >> 1);
// Don't round past the maximum significand
if(output < maxsignificand)
output++;
}
else
output = (output >> 1);
// If significand and exponent are zero means a number of zero
if((output == 0) && (unsignedExponent == 0))
{
// return correctly signed result
if(field.Integer & 0x80000000ul)
return 0x00800000ul;
else
return 0;
}
// Get the un-biased exponent. Binary32 is biased by 127
signedExponent = unsignedExponent - 127;
if(signedExponent < -bias)
output = 0; // underflow to zero
else
{
if(signedExponent > bias)
{
// Largest possible exponent and significand without making a NaN or Inf
signedExponent = bias;
output = (uint32_t)(1 << sigbits) - 1;
}
// re-bias with the new bias
unsignedExponent = (uint32_t)(signedExponent + bias);
// Put the exponent in the output
output |= (uint32_t)(unsignedExponent << sigbits);
}
// Account for the sign
if(field.Integer & 0x80000000ul)
output |= 0x00800000ul;
// return the 24-bit representation
return output;
}// float32ToFloat24
/*!
* Convert a 24-bit floating point representation with variable number of
* significand bits to binary32
* \param value is the float24 representation to convert.
* \param sigbits is the number of bits to use for the significand of the
* 24-bit float, and must be between 4 and 20 bits inclusive.
* \return the binary32 version as a float.
*/
float float24ToFloat32(uint32_t value, int sigbits)
{
union
{
float Float;
uint32_t Integer;
}field;
// Zero is a special case
if((value & 0x007FFFFFul) == 0)
{
field.Integer = 0;
}
else
{
// The mask for the significand bits
int sigmask = (1 << sigbits) - 1;
// The unsigned exponent, mask off the leading sign bit
uint32_t unsignedExponent = ((value & 0x007FFFFFul) >> sigbits);
// The bias is computed as 2 raised to the number of exponent bits divided
// by two, minus 1. This can be simplified as 2^(exponent bits -1) - 1
// The number of exponent bits is 24 - 1 - sigbits
int bias = (1 << (22 - sigbits)) - 1;
// We want to subtract our bias to get un-biased, and then add 127 for the new bias
unsignedExponent += (127 - bias);
// Reduced bits of signficand, shift it up to 23 bits
field.Integer = (value & sigmask) << (23-sigbits);
// Put the exponent in
field.Integer |= (unsignedExponent << 23);
}
// And the sign bit
if(value & 0x00800000ul)
field.Integer |= 0x80000000ul;
return field.Float;
}// float24ToFloat32
/*!
* Convert a 32-bit floating point value (IEEE-754 binary32) to 16-bit floating
* point representation with a variable number of bits for the significand.
* Underflow will be returned as zero and overflow as the maximum possible value.
* \param value is the 32-bit floating point data to convert.
* \param sigbits is the number of bits to use for the significand, and must be
* between 4 and 12 bits inclusive.
* \return The float16 as a simple 16-bit integer.
*/
uint16_t float32ToFloat16(float value, int sigbits)
{
union
{
float Float;
uint32_t Integer;
}field;
uint32_t significand;
uint32_t unsignedExponent;
int32_t signedExponent;
uint16_t output;
uint16_t maxsignificand = (uint16_t)((1 << sigbits) - 1);
// The bias is computed as 2 raised to the number of exponent bits divided
// by two, minus 1. This can be simplified as 2^(exponent bits -1) - 1
// The number of exponent bits is 16 - 1 - sigbits
int bias = (1 << (14 - sigbits)) - 1;
// Write the floating point value to our union so we can access its bits.
// Note that C99 and C++2011 have built in goodness for this sort of
// thing, but not all compilers support that (sigh...)
field.Float = value;
// The significand is the least significant 23 bits (IEEE754)
significand = field.Integer & 0x007FFFFFul;
// Exponent occupies the next 8 bits (IEEE754)
unsignedExponent = (field.Integer & 0x7F800000ul) >> 23;
// Get rid of some bits; here is where we sacrifice resolution, so we need
// to round correctly, for that reason we do not shift the last bit yet
output = (uint16_t)(significand >> (22-sigbits));
// Check for rounding, if the most significant bit that we are going to
// throw away is a 1 we need to round up.
if(output & 0x01)
{
output = (output >> 1);
// Don't round past the maximum significand
if(output < maxsignificand)
output++;
}
else
output = (output >> 1);
// If significand and exponent are zero means a number of zero
if((output == 0) && (unsignedExponent == 0))
{
// return correctly signed result
if(field.Integer & 0x80000000ul)
return 0x8000;
else
return 0;
}
// Get the un-biased exponent. Binary32 is biased by 127
signedExponent = unsignedExponent - 127;
// With a 6-bit exponent we can support exponents of
// exponent : biased value
// -31 : 0 (value is zero or denormalized)
// -30 : 1
// -1 : 30
// 0 : 31
// 1 : 32
// 31 : 62
// 32 : NaN (all exponent bits are 1)
// With a 5-bit exponent we get
// exponent : biased value
// -15 : 0 (value is zero or denormalized)
// -14 : 1
// -1 : 14
// 0 : 15
// 1 : 16
// 15 : 30
// 16 : NaN (all exponent bits are 1)
if(signedExponent < -bias)
output = 0; // underflow to zero
else
{
if(signedExponent > bias)
{
// Largest possible exponent and significand without making a NaN or Inf
signedExponent = bias;
output = (uint16_t)(1 << sigbits) - 1;
}
// re-bias with the new bias
unsignedExponent = (uint32_t)(signedExponent + bias);
// Put the exponent in the output
output |= (uint16_t)(unsignedExponent << sigbits);
}
// Account for the sign
if(field.Integer & 0x80000000ul)
output |= 0x8000;
// return the binary16 representation
return output;
}// float32ToFloat16
/*!
* Convert a 16-bit floating point representation with variable number of
* significand bits to binary32
* \param value is the float16 representation to convert.
* \param sigbits is the number of bits to use for the significand of the
* 16-bit float, and must be between 4 and 12 bits inclusive.
* \return the binary32 version as a float.
*/
float float16ToFloat32(uint16_t value, int sigbits)
{
union
{
float Float;
uint32_t Integer;
}field;
// Zero is a special case
if((value & 0x7FFF) == 0)
{
field.Integer = 0;
}
else
{
// The mask for the significand bits
int sigmask = (1 << sigbits) - 1;
// The unsigned exponent, mask off the leading sign bit
uint32_t unsignedExponent = ((value & 0x7FFF) >> sigbits);
// The bias is computed as 2 raised to the number of exponent bits divided
// by two, minus 1. This can be simplified as 2^(exponent bits -1) - 1
// The number of exponent bits is 16 - 1 - sigbits
int bias = (1 << (14 - sigbits)) - 1;
// We want to subtract our bias to get un-biased, and then add 127 for the new bias
unsignedExponent += (127 - bias);
// Reduced bits of signficand, shift it up to 23 bits
field.Integer = (value & sigmask) << (23-sigbits);
// Put the exponent in
field.Integer |= (unsignedExponent << 23);
}
// And the sign bit
if(value & 0x8000)
field.Integer |= 0x80000000ul;
return field.Float;
}// float16ToFloat32
/*!
* Use this routine (and a debugger) to verify the special float functionality
* \return 1 if test passed
*/
int testSpecialFloat(void)
{
int i;
float dataIn[6], dataOut16[6], dataOut24[6];
union
{
float Float;
uint32_t Integer;
}test;
float error = 0;
test.Float = -.123456789f;
for(i = 0; i < 3; i++)
{
test.Float *= 10.0f;
dataIn[i] = test.Float;
dataOut16[i] = float16ToFloat32(float32ToFloat16(dataIn[i], 9), 9);
dataOut24[i] = float24ToFloat32(float32ToFloat24(dataIn[i], 15), 15);
error += (float)fabs((dataIn[i] - dataOut16[i])/dataIn[i]);
error += (float)fabs((dataIn[i] - dataOut24[i])/dataIn[i]);
}
test.Float = 12.3456789f;
for(;i < 6; i++)
{
test.Float /= 10.0f;
dataIn[i] = test.Float;
dataOut16[i] = float16ToFloat32(float32ToFloat16(dataIn[i], 9), 9);
dataOut24[i] = float24ToFloat32(float32ToFloat24(dataIn[i], 15), 15);
error += (float)fabs((dataIn[i] - dataOut16[i])/dataIn[i]);
error += (float)fabs((dataIn[i] - dataOut24[i])/dataIn[i]);
}
if(error > 0.01f)
return 0;
// Test rounding
test.Float = float16ToFloat32(float32ToFloat16(33.34f, 10), 10);
if(test.Float < 33.34f)
return 0;
test.Float = float16ToFloat32(float32ToFloat16(33.32f, 10), 10);
if(test.Float > 33.32f)
return 0;
// Maximum possible float without Inf or Nan
test.Integer = 0x7F7FFFFF;
// This loop exercises the overflow and underflow, use the debugger to verify functionality
for(i = 0; i < 6; i++)
{
dataIn[i] = test.Float;
dataOut16[i] = float16ToFloat32(float32ToFloat16(dataIn[i], 9), 9);
dataOut24[i] = float24ToFloat32(float32ToFloat24(dataIn[i], 15), 15);
test.Float /= 1000000000000.0f;
}
return 1;
}// testSpecialFloat)===");

    source.makeLineSeparator();

    return source.flush();

}// ProtocolFloatSpecial::generateSource