2022-10-13 17:30:19 -03:00
|
|
|
#include "float16.h"
#include <string.h>
|
|
|
|
/*
  float16 implementation

  Note that this is the IEEE half-precision 16-bit float, *not* bfloat16.

  Algorithm with thanks to libcanard:
  https://github.com/dronecan/libcanard
 */
|
|
|
|
|
|
|
|
|
|
|
|
float Float16_t::get(void) const
|
|
|
|
{
|
|
|
|
union FP32 {
|
|
|
|
uint32_t u;
|
|
|
|
float f;
|
|
|
|
};
|
|
|
|
const union FP32 magic = { (254UL - 15UL) << 23U };
|
|
|
|
const union FP32 was_inf_nan = { (127UL + 16UL) << 23U };
|
|
|
|
union FP32 out;
|
|
|
|
|
|
|
|
out.u = (v16 & 0x7FFFU) << 13U;
|
|
|
|
out.f *= magic.f;
|
|
|
|
if (out.f >= was_inf_nan.f) {
|
|
|
|
out.u |= 255UL << 23U;
|
|
|
|
}
|
|
|
|
out.u |= (v16 & 0x8000UL) << 16U;
|
|
|
|
|
|
|
|
return out.f;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Float16_t::set(float value)
|
|
|
|
{
|
|
|
|
union FP32
|
|
|
|
{
|
|
|
|
uint32_t u;
|
|
|
|
float f;
|
|
|
|
};
|
|
|
|
|
|
|
|
const union FP32 f32inf = { 255UL << 23U };
|
|
|
|
const union FP32 f16inf = { 31UL << 23U };
|
|
|
|
const union FP32 magic = { 15UL << 23U };
|
|
|
|
const uint32_t sign_mask = 0x80000000UL;
|
|
|
|
const uint32_t round_mask = 0xFFFFF000UL;
|
|
|
|
|
|
|
|
union FP32 in;
|
|
|
|
in.f = value;
|
|
|
|
uint32_t sign = in.u & sign_mask;
|
|
|
|
in.u ^= sign;
|
|
|
|
|
|
|
|
v16 = 0;
|
|
|
|
|
|
|
|
if (in.u >= f32inf.u)
|
|
|
|
{
|
|
|
|
v16 = (in.u > f32inf.u) ? (uint16_t)0x7FFFU : (uint16_t)0x7C00U;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
in.u &= round_mask;
|
|
|
|
in.f *= magic.f;
|
|
|
|
in.u -= round_mask;
|
|
|
|
if (in.u > f16inf.u)
|
|
|
|
{
|
|
|
|
in.u = f16inf.u;
|
|
|
|
}
|
|
|
|
v16 = (uint16_t)(in.u >> 13U);
|
|
|
|
}
|
|
|
|
|
|
|
|
v16 |= (uint16_t)(sign >> 16U);
|
|
|
|
}
|