/*
  return 64 bit x / 1000
  about 3x faster than the normal gcc implementation
  With thanks to https://0x414b.com/2021/04/16/arm-division.html
  and https://stackoverflow.com/questions/74765410/multiply-two-uint64-ts-and-store-result-to-uint64-t-doesnt-seem-to-work
*/
static inline uint64_t uint64_div1000(uint64_t x)
{
    /*
      x / 1000 == (x / 8) / 125. The division by 8 is a plain shift; the
      division by 125 is done by multiplying with the 64-bit fixed-point
      reciprocal 0x20c49ba5e353f7cf (scaled by 2^68) and keeping only the
      bits above bit 68 of the 128-bit product. The 128-bit product is
      built from four 32x32 -> 64 bit partial products so that no native
      128-bit type is needed.
    */
    const uint64_t magic_hi = 0x20c49ba5U;
    const uint64_t magic_lo = 0xe353f7cfU;

    // pre-divide by 8, then split into 32-bit halves
    const uint64_t n = x >> 3U;
    const uint64_t n_hi = n >> 32;
    const uint64_t n_lo = n & 0xffffffffU;

    // the four partial products of n * magic
    const uint64_t lo_lo = n_lo * magic_lo;
    const uint64_t lo_hi = n_lo * magic_hi;
    const uint64_t hi_lo = n_hi * magic_lo;
    const uint64_t hi_hi = n_hi * magic_hi;

    // Sum of everything that lands in bits 32..95 of the full product.
    // One 64-bit product plus two 32-bit values cannot exceed 2^64 - 1,
    // so this accumulation never wraps.
    uint64_t cross = hi_lo;
    cross += lo_lo >> 32;
    cross += lo_hi & 0xffffffffU;

    // upper 64 bits of the 128-bit product (again cannot wrap)
    const uint64_t upper = hi_hi + (cross >> 32) + (lo_hi >> 32);

    // remaining 4 bits of the 2^68 scaling
    return upper >> 4U;
}