cpython/Include/internal/pycore_bitutils.h

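/* Bit and bytes utilities.

   Bytes swap functions, reverse order of bytes:

   - _Py_bswap16(uint16_t)
   - _Py_bswap32(uint32_t)
   - _Py_bswap64(uint64_t)

   Bit counting functions:

   - _Py_popcount32(uint32_t): number of bits set to 1
   - _Py_bit_length(unsigned long): number of bits needed to represent a value
*/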

#ifndef Py_INTERNAL_BITUTILS_H
#define Py_INTERNAL_BITUTILS_H
#ifdef __cplusplus
extern "C" {
#endif

#ifndef Py_BUILD_CORE
#  error "this header requires Py_BUILD_CORE define"
#endif

#if defined(__GNUC__)
#  if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8))
     /* The compiler reports itself as GCC 4.8 or newer, so the three byte
        swap builtins can be used:
        __builtin_bswap16() is available since GCC 4.8,
        __builtin_bswap32() is available since GCC 4.3,
        __builtin_bswap64() is available since GCC 4.3. */
#    define _PY_HAVE_BUILTIN_BSWAP
#  endif
#endif

#ifdef _MSC_VER
#  include <intrin.h>             // _byteswap_uint64()
#endif


static inline uint16_t
_Py_bswap16(uint16_t word)
{
    // Return 'word' with its two bytes exchanged. A compiler intrinsic is
    // used when one is known to exist, a mask-and-shift expression otherwise.
#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap16)
    return __builtin_bswap16(word);
#elif defined(_MSC_VER)
    // _byteswap_ushort() works on an unsigned short: check at build time
    // that this type has the same size as 'word'.
    Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short));
    return _byteswap_ushort(word);
#else
    // Portable implementation which doesn't rely on circular bit shift:
    // isolate each byte, then move it to the other position.
    const uint16_t low_byte = word & UINT16_C(0x00FF);
    const uint16_t high_byte = word & UINT16_C(0xFF00);
    return (low_byte << 8) | (high_byte >> 8);
#endif
}

static inline uint32_t
_Py_bswap32(uint32_t word)
{
    // Return 'word' with the order of its four bytes reversed. A compiler
    // intrinsic is used when one is known to exist, a mask-and-shift
    // expression otherwise.
#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap32)
    return __builtin_bswap32(word);
#elif defined(_MSC_VER)
    // _byteswap_ulong() works on an unsigned long: check at build time that
    // this type has the same size as 'word'.
    Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long));
    return _byteswap_ulong(word);
#else
    // Portable implementation which doesn't rely on circular bit shift:
    // isolate each byte (byte0 is the least significant one), then move it
    // to the mirrored position.
    const uint32_t byte0 = word & UINT32_C(0x000000FF);
    const uint32_t byte1 = word & UINT32_C(0x0000FF00);
    const uint32_t byte2 = word & UINT32_C(0x00FF0000);
    const uint32_t byte3 = word & UINT32_C(0xFF000000);
    return (byte0 << 24) | (byte1 << 8) | (byte2 >> 8) | (byte3 >> 24);
#endif
}

static inline uint64_t
_Py_bswap64(uint64_t word)
{
    // Return 'word' with the order of its eight bytes reversed. A compiler
    // intrinsic is used when one is known to exist, a mask-and-shift
    // expression otherwise.
#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap64)
    return __builtin_bswap64(word);
#elif defined(_MSC_VER)
    return _byteswap_uint64(word);
#else
    // Portable implementation which doesn't rely on circular bit shift:
    // isolate each byte (byte0 is the least significant one), then move it
    // to the mirrored position.
    const uint64_t byte0 = word & UINT64_C(0x00000000000000FF);
    const uint64_t byte1 = word & UINT64_C(0x000000000000FF00);
    const uint64_t byte2 = word & UINT64_C(0x0000000000FF0000);
    const uint64_t byte3 = word & UINT64_C(0x00000000FF000000);
    const uint64_t byte4 = word & UINT64_C(0x000000FF00000000);
    const uint64_t byte5 = word & UINT64_C(0x0000FF0000000000);
    const uint64_t byte6 = word & UINT64_C(0x00FF000000000000);
    const uint64_t byte7 = word & UINT64_C(0xFF00000000000000);
    return (byte0 << 56) | (byte1 << 40) | (byte2 << 24) | (byte3 << 8)
         | (byte4 >> 8) | (byte5 >> 24) | (byte6 >> 40) | (byte7 >> 56);
#endif
}


// Population count: count the number of 1's in 'x'
// (number of bits set to 1), also known as the hamming weight.
//
// Implementation note. CPUID is not used to test whether the x86 POPCNT
// instruction is available, in order to keep the implementation simple. For
// example, the Visual Studio __popcnt() intrinsic is not used for this reason.
// The clang and GCC builtin functions can use the x86 POPCNT instruction if
// the target architecture has SSE4a or newer.
static inline int
_Py_popcount32(uint32_t x)
{
    // Return the number of bits set to 1 in 'x'.
#if (defined(__clang__) || defined(__GNUC__))

#if SIZEOF_INT >= 4
    // __builtin_popcount() takes an unsigned int: check at build time that
    // this type is at least as large as 'x'.
    Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
    return __builtin_popcount(x);
#else
    // The C standard guarantees that unsigned long will always be big enough
    // to hold a uint32_t value without losing information.
    Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
    return __builtin_popcountl(x);
#endif

#else
    // 32-bit SWAR (SIMD Within A Register) popcount: partial counts are
    // accumulated in fields of 2, then 4, then 8 bits.

    // Binary: 0 1 0 1 ...
    const uint32_t MASK_ODD_BITS = 0x55555555;
    // Binary: 00 11 00 11 ...
    const uint32_t MASK_LOW_PAIRS = 0x33333333;
    // Binary: 0000 1111 0000 1111 ...
    const uint32_t MASK_LOW_NIBBLES = 0x0F0F0F0F;

    // Each 2-bit field holds the number of bits set in that field of 'x'
    const uint32_t count2 = x - ((x >> 1) & MASK_ODD_BITS);
    // Each 4-bit field holds the sum of its two 2-bit counts
    const uint32_t count4 = (count2 & MASK_LOW_PAIRS)
                            + ((count2 >> 2) & MASK_LOW_PAIRS);
    // Each 8-bit field holds the sum of its two 4-bit counts
    const uint32_t count8 = (count4 + (count4 >> 4)) & MASK_LOW_NIBBLES;

    // Sum of the 4 byte counts: the multiplication accumulates them in the
    // most significant byte, which the shift then extracts.
    //
    // Be careful when changing the return statement below: C's "integer
    // promotions" (C99 §6.3.1.1p2) make its portability and correctness
    // delicate. If `int` is wider than 32 bits, `count8` is promoted to
    // `int`; the "usual arithmetic conversions" (C99 §6.3.1.8) then make the
    // multiplication one of two `unsigned int` operands, and the cast back
    // to `uint32_t` is required to keep only the 32 least significant bits.
    // If `int` is not wider than 32 bits, both operands are 32-bit unsigned
    // integers and the (uint32_t) cast is a no-op. Either way, the
    // multiplication is never performed on signed integer types, so there is
    // no risk of undefined behaviour caused by a signed overflow.
    return (uint32_t)(count8 * 0x01010101U) >> 24;
#endif
}


// Return the bit length of 'x': the 1-based position of its most significant
// 1 bit, or 0 if x is 0. This is the smallest integer k such that x < 2**k.
// Equivalent to floor(log2(x)) + 1 for x != 0.
static inline int
_Py_bit_length(unsigned long x)
{
    // Return the number of bits needed to represent 'x', 0 if x is 0.
#if (defined(__clang__) || defined(__GNUC__))
    // __builtin_clzl() is available since GCC 3.4. Its behavior is undefined
    // for x == 0, so this case is handled before calling it.
    if (x == 0) {
        return 0;
    }
    // Width of 'unsigned long' in bits minus the number of leading zero bits
    const int width = (int)sizeof(unsigned long) * 8;
    return width - __builtin_clzl(x);
#elif defined(_MSC_VER)
    // _BitScanReverse() is documented to search 32 bits.
    Py_BUILD_ASSERT(sizeof(unsigned long) <= 4);
    unsigned long top_bit;
    // On success, 'top_bit' is set to the 0-based index of the highest set
    // bit; a zero return value is mapped to a bit length of 0.
    if (!_BitScanReverse(&top_bit, x)) {
        return 0;
    }
    return (int)top_bit + 1;
#else
    // Bit length of each value in range(32)
    const int SMALL_BIT_LENGTH[32] = {
        0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
    };
    // Drop 6 low bits per iteration (counting them) until the remaining
    // value is small enough to be looked up in the table.
    int length = 0;
    for (; x >= 32; x >>= 6) {
        length += 6;
    }
    return length + SMALL_BIT_LENGTH[x];
#endif
}


#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_BITUTILS_H */
bpo-29882: Add _Py_popcount32() function (GH-20518) * Rename pycore_byteswap.h to pycore_bitutils.h. * Move popcount_digit() to pycore_bitutils.h as _Py_popcount32(). * _Py_popcount32() uses GCC and clang builtin function if available. * Add unit tests to _Py_popcount32(). 2020-06-08 11:30:33 -03:00			`/* Bit and bytes utilities.`

			`Bytes swap functions, reverse order of bytes:`
bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00
			`- _Py_bswap16(uint16_t)`
			`- _Py_bswap32(uint32_t)`
			`- _Py_bswap64(uint64_t)`
			`*/`

bpo-29782: Consolidate _Py_Bit_Length() (GH-20739) In GH-2866, _Py_Bit_Length() was added to pymath.h for lack of a better location. GH-20518 added a more appropriate header file for bit utilities. It also shows how to properly use intrinsics. This allows reconsidering bpo-29782. * Move the function to the new header. * Changed return type to match __builtin_clzl() and reviewed usage. * Use intrinsics where available. * Pick a fallback implementation suitable for inlining. 2020-06-15 09:33:48 -03:00			`#ifndef Py_INTERNAL_BITUTILS_H`
			`#define Py_INTERNAL_BITUTILS_H`
bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

			`#ifndef Py_BUILD_CORE`
			`# error "this header requires Py_BUILD_CORE define"`
			`#endif`

bpo-41617: Add _Py__has_builtin() macro (GH-23260) Fix building pycore_bitutils.h internal header on old clang version without __builtin_bswap16() (ex: Xcode 4.6.3 on Mac OS X 10.7). Add a new private _Py__has_builtin() macro to check for availability of a preprocessor builtin function. Co-Authored-By: Joshua Root <jmr@macports.org> Co-authored-by: Joshua Root <jmr@macports.org> 2020-11-13 10:38:17 -04:00			`#if defined(__GNUC__) \`
			`&& ((__GNUC__ >= 5) \|\| (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8))`
			`/* __builtin_bswap16() is available since GCC 4.8,`
bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00			`__builtin_bswap32() is available since GCC 4.3,`
			`__builtin_bswap64() is available since GCC 4.3. */`
			`# define _PY_HAVE_BUILTIN_BSWAP`
			`#endif`

			`#ifdef _MSC_VER`
gh-108216: Cleanup #include in internal header files (#108228) * Add missing includes. * Remove unused includes. * Update old include/symbol names to newer names. * Mention at least one included symbol. * Sort includes. * Update Tools/cases_generator/generate_cases.py used to generated pycore_opcode_metadata.h. * Update Parser/asdl_c.py used to generate pycore_ast.h. * Cleanup also includes in _testcapimodule.c and _testinternalcapi.c. 2023-08-21 15:05:59 -03:00			`# include <intrin.h> // _byteswap_uint64()`
bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00			`#endif`

gh-108216: Cleanup #include in internal header files (#108228) * Add missing includes. * Remove unused includes. * Update old include/symbol names to newer names. * Mention at least one included symbol. * Sort includes. * Update Tools/cases_generator/generate_cases.py used to generated pycore_opcode_metadata.h. * Update Parser/asdl_c.py used to generate pycore_ast.h. * Cleanup also includes in _testcapimodule.c and _testinternalcapi.c. 2023-08-21 15:05:59 -03:00
bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00			`static inline uint16_t`
			`_Py_bswap16(uint16_t word)`
			`{`
bpo-41617: Add _Py__has_builtin() macro (GH-23260) Fix building pycore_bitutils.h internal header on old clang version without __builtin_bswap16() (ex: Xcode 4.6.3 on Mac OS X 10.7). Add a new private _Py__has_builtin() macro to check for availability of a preprocessor builtin function. Co-Authored-By: Joshua Root <jmr@macports.org> Co-authored-by: Joshua Root <jmr@macports.org> 2020-11-13 10:38:17 -04:00			`#if defined(_PY_HAVE_BUILTIN_BSWAP) \|\| _Py__has_builtin(__builtin_bswap16)`
bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00			`return __builtin_bswap16(word);`
			`#elif defined(_MSC_VER)`
			`Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short));`
			`return _byteswap_ushort(word);`
			`#else`
			`// Portable implementation which doesn't rely on circular bit shift`
			`return ( ((word & UINT16_C(0x00FF)) << 8)`
			`\| ((word & UINT16_C(0xFF00)) >> 8));`
			`#endif`
			`}`

			`static inline uint32_t`
			`_Py_bswap32(uint32_t word)`
			`{`
bpo-41617: Add _Py__has_builtin() macro (GH-23260) Fix building pycore_bitutils.h internal header on old clang version without __builtin_bswap16() (ex: Xcode 4.6.3 on Mac OS X 10.7). Add a new private _Py__has_builtin() macro to check for availability of a preprocessor builtin function. Co-Authored-By: Joshua Root <jmr@macports.org> Co-authored-by: Joshua Root <jmr@macports.org> 2020-11-13 10:38:17 -04:00			`#if defined(_PY_HAVE_BUILTIN_BSWAP) \|\| _Py__has_builtin(__builtin_bswap32)`
bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00			`return __builtin_bswap32(word);`
			`#elif defined(_MSC_VER)`
			`Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long));`
			`return _byteswap_ulong(word);`
			`#else`
			`// Portable implementation which doesn't rely on circular bit shift`
			`return ( ((word & UINT32_C(0x000000FF)) << 24)`
			`\| ((word & UINT32_C(0x0000FF00)) << 8)`
			`\| ((word & UINT32_C(0x00FF0000)) >> 8)`
			`\| ((word & UINT32_C(0xFF000000)) >> 24));`
			`#endif`
			`}`

			`static inline uint64_t`
			`_Py_bswap64(uint64_t word)`
			`{`
bpo-41617: Add _Py__has_builtin() macro (GH-23260) Fix building pycore_bitutils.h internal header on old clang version without __builtin_bswap16() (ex: Xcode 4.6.3 on Mac OS X 10.7). Add a new private _Py__has_builtin() macro to check for availability of a preprocessor builtin function. Co-Authored-By: Joshua Root <jmr@macports.org> Co-authored-by: Joshua Root <jmr@macports.org> 2020-11-13 10:38:17 -04:00			`#if defined(_PY_HAVE_BUILTIN_BSWAP) \|\| _Py__has_builtin(__builtin_bswap64)`
bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00			`return __builtin_bswap64(word);`
			`#elif defined(_MSC_VER)`
			`return _byteswap_uint64(word);`
			`#else`
			`// Portable implementation which doesn't rely on circular bit shift`
			`return ( ((word & UINT64_C(0x00000000000000FF)) << 56)`
			`\| ((word & UINT64_C(0x000000000000FF00)) << 40)`
			`\| ((word & UINT64_C(0x0000000000FF0000)) << 24)`
			`\| ((word & UINT64_C(0x00000000FF000000)) << 8)`
			`\| ((word & UINT64_C(0x000000FF00000000)) >> 8)`
			`\| ((word & UINT64_C(0x0000FF0000000000)) >> 24)`
			`\| ((word & UINT64_C(0x00FF000000000000)) >> 40)`
			`\| ((word & UINT64_C(0xFF00000000000000)) >> 56));`
			`#endif`
			`}`


bpo-29882: Add _Py_popcount32() function (GH-20518) * Rename pycore_byteswap.h to pycore_bitutils.h. * Move popcount_digit() to pycore_bitutils.h as _Py_popcount32(). * _Py_popcount32() uses GCC and clang builtin function if available. * Add unit tests to _Py_popcount32(). 2020-06-08 11:30:33 -03:00			`// Population count: count the number of 1's in 'x'`
			`// (number of bits set to 1), also known as the hamming weight.`
			`//`
			`// Implementation note. CPUID is not used, to test if x86 POPCNT instruction`
			`// can be used, to keep the implementation simple. For example, Visual Studio`
			`// __popcnt() is not used this reason. The clang and GCC builtin function can`
			`// use the x86 POPCNT instruction if the target architecture has SSE4a or`
			`// newer.`
			`static inline int`
			`_Py_popcount32(uint32_t x)`
			`{`
			`#if (defined(__clang__) \|\| defined(__GNUC__))`

			`#if SIZEOF_INT >= 4`
			`Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));`
			`return __builtin_popcount(x);`
			`#else`
			`// The C standard guarantees that unsigned long will always be big enough`
			`// to hold a uint32_t value without losing information.`
			`Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));`
			`return __builtin_popcountl(x);`
			`#endif`

			`#else`
			`// 32-bit SWAR (SIMD Within A Register) popcount`

			`// Binary: 0 1 0 1 ...`
			`const uint32_t M1 = 0x55555555;`
			`// Binary: 00 11 00 11. ..`
			`const uint32_t M2 = 0x33333333;`
			`// Binary: 0000 1111 0000 1111 ...`
			`const uint32_t M4 = 0x0F0F0F0F;`

			`// Put count of each 2 bits into those 2 bits`
			`x = x - ((x >> 1) & M1);`
			`// Put count of each 4 bits into those 4 bits`
			`x = (x & M2) + ((x >> 2) & M2);`
			`// Put count of each 8 bits into those 8 bits`
			`x = (x + (x >> 4)) & M4;`
bpo-29882: Fix portability bug introduced in GH-30774 (#30794) 2022-01-23 05:59:34 -04:00			`// Sum of the 4 byte counts.`
			`// Take care when considering changes to the next line. Portability and`
			`// correctness are delicate here, thanks to C's "integer promotions" (C99`
			// §6.3.1.1p2). On machines where the `int` type has width greater than 32
			// bits, `x` will be promoted to an `int`, and following C's "usual
			`// arithmetic conversions" (C99 §6.3.1.8), the multiplication will be`
			// performed as a multiplication of two `unsigned int` operands. In this
			// case it's critical that we cast back to `uint32_t` in order to keep only
			// the least significant 32 bits. On machines where the `int` type has
			`// width no greater than 32, the multiplication is of two 32-bit unsigned`
			`// integer types, and the (uint32_t) cast is a no-op. In both cases, we`
			`// avoid the risk of undefined behaviour due to overflow of a`
			`// multiplication of signed integer types.`
			`return (uint32_t)(x * 0x01010101U) >> 24;`
bpo-29882: Add _Py_popcount32() function (GH-20518) * Rename pycore_byteswap.h to pycore_bitutils.h. * Move popcount_digit() to pycore_bitutils.h as _Py_popcount32(). * _Py_popcount32() uses GCC and clang builtin function if available. * Add unit tests to _Py_popcount32(). 2020-06-08 11:30:33 -03:00			`#endif`
			`}`


bpo-29782: Consolidate _Py_Bit_Length() (GH-20739) In GH-2866, _Py_Bit_Length() was added to pymath.h for lack of a better location. GH-20518 added a more appropriate header file for bit utilities. It also shows how to properly use intrinsics. This allows reconsidering bpo-29782. * Move the function to the new header. * Changed return type to match __builtin_clzl() and reviewed usage. * Use intrinsics where available. * Pick a fallback implementation suitable for inlining. 2020-06-15 09:33:48 -03:00			`// Return the index of the most significant 1 bit in 'x'. This is the smallest`
			`// integer k such that x < 2**k. Equivalent to floor(log2(x)) + 1 for x != 0.`
			`static inline int`
			`_Py_bit_length(unsigned long x)`
			`{`
			`#if (defined(__clang__) \|\| defined(__GNUC__))`
			`if (x != 0) {`
			`// __builtin_clzl() is available since GCC 3.4.`
			`// Undefined behavior for x == 0.`
			`return (int)sizeof(unsigned long) * 8 - __builtin_clzl(x);`
			`}`
			`else {`
			`return 0;`
			`}`
			`#elif defined(_MSC_VER)`
			`// _BitScanReverse() is documented to search 32 bits.`
			`Py_BUILD_ASSERT(sizeof(unsigned long) <= 4);`
			`unsigned long msb;`
			`if (_BitScanReverse(&msb, x)) {`
			`return (int)msb + 1;`
			`}`
			`else {`
			`return 0;`
			`}`
			`#else`
			`const int BIT_LENGTH_TABLE[32] = {`
			`0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,`
			`5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5`
			`};`
			`int msb = 0;`
			`while (x >= 32) {`
			`msb += 6;`
			`x >>= 6;`
			`}`
			`msb += BIT_LENGTH_TABLE[x];`
			`return msb;`
			`#endif`
			`}`


bpo-40302: Add pycore_byteswap.h header file (GH-19552) Add a new internal pycore_byteswap.h header file with the following functions: * _Py_bswap16() * _Py_bswap32() * _Py_bswap64() Use these functions in _ctypes, sha256 and sha512 modules, and also use in the UTF-32 encoder. sha256, sha512 and _ctypes modules are now built with the internal C API. 2020-04-17 12:47:20 -03:00			`#ifdef __cplusplus`
			`}`
			`#endif`
bpo-29782: Consolidate _Py_Bit_Length() (GH-20739) In GH-2866, _Py_Bit_Length() was added to pymath.h for lack of a better location. GH-20518 added a more appropriate header file for bit utilities. It also shows how to properly use intrinsics. This allows reconsidering bpo-29782. * Move the function to the new header. * Changed return type to match __builtin_clzl() and reviewed usage. * Use intrinsics where available. * Pick a fallback implementation suitable for inlining. 2020-06-15 09:33:48 -03:00			`#endif /* !Py_INTERNAL_BITUTILS_H */`