bpo-29882: Add _Py_popcount32() function (GH-20518)
* Rename pycore_byteswap.h to pycore_bitutils.h. * Move popcount_digit() to pycore_bitutils.h as _Py_popcount32(). * _Py_popcount32() uses the GCC and clang builtin functions if available. * Add unit tests for _Py_popcount32().
This commit is contained in:
parent
301f0d4ff9
commit
c6b292cdee
|
@ -1,4 +1,6 @@
|
||||||
/* Bytes swap functions, reverse order of bytes:
|
/* Bit and bytes utilities.
|
||||||
|
|
||||||
|
Bytes swap functions, reverse order of bytes:
|
||||||
|
|
||||||
- _Py_bswap16(uint16_t)
|
- _Py_bswap16(uint16_t)
|
||||||
- _Py_bswap32(uint32_t)
|
- _Py_bswap32(uint32_t)
|
||||||
|
@ -82,6 +84,53 @@ _Py_bswap64(uint64_t word)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Population count: count the number of 1's in 'x'
|
||||||
|
// (number of bits set to 1), also known as the Hamming weight.
|
||||||
|
//
|
||||||
|
// Implementation note: CPUID is not used to test if the x86 POPCNT instruction
|
||||||
|
// can be used, to keep the implementation simple. For example, Visual Studio
|
||||||
|
// __popcnt() is not used for this reason. The clang and GCC builtin functions can
|
||||||
|
// use the x86 POPCNT instruction if the target architecture has SSE4a or
|
||||||
|
// newer.
|
||||||
|
static inline int
|
||||||
|
_Py_popcount32(uint32_t x)
|
||||||
|
{
|
||||||
|
#if (defined(__clang__) || defined(__GNUC__))
|
||||||
|
|
||||||
|
#if SIZEOF_INT >= 4
|
||||||
|
Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
|
||||||
|
return __builtin_popcount(x);
|
||||||
|
#else
|
||||||
|
// The C standard guarantees that unsigned long will always be big enough
|
||||||
|
// to hold a uint32_t value without losing information.
|
||||||
|
Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
|
||||||
|
return __builtin_popcountl(x);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
// 32-bit SWAR (SIMD Within A Register) popcount
|
||||||
|
|
||||||
|
// Binary: 0 1 0 1 ...
|
||||||
|
const uint32_t M1 = 0x55555555;
|
||||||
|
// Binary: 00 11 00 11 ...
|
||||||
|
const uint32_t M2 = 0x33333333;
|
||||||
|
// Binary: 0000 1111 0000 1111 ...
|
||||||
|
const uint32_t M4 = 0x0F0F0F0F;
|
||||||
|
// 256**3 + 256**2 + 256**1 + 256**0
|
||||||
|
const uint32_t SUM = 0x01010101;
|
||||||
|
|
||||||
|
// Put count of each 2 bits into those 2 bits
|
||||||
|
x = x - ((x >> 1) & M1);
|
||||||
|
// Put count of each 4 bits into those 4 bits
|
||||||
|
x = (x & M2) + ((x >> 2) & M2);
|
||||||
|
// Put count of each 8 bits into those 8 bits
|
||||||
|
x = (x + (x >> 4)) & M4;
|
||||||
|
// Sum of the 4 byte counts
|
||||||
|
return (uint32_t)((uint64_t)x * (uint64_t)SUM) >> 24;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
|
@ -1121,7 +1121,7 @@ PYTHON_HEADERS= \
|
||||||
$(srcdir)/Include/internal/pycore_abstract.h \
|
$(srcdir)/Include/internal/pycore_abstract.h \
|
||||||
$(srcdir)/Include/internal/pycore_accu.h \
|
$(srcdir)/Include/internal/pycore_accu.h \
|
||||||
$(srcdir)/Include/internal/pycore_atomic.h \
|
$(srcdir)/Include/internal/pycore_atomic.h \
|
||||||
$(srcdir)/Include/internal/pycore_byteswap.h \
|
$(srcdir)/Include/internal/pycore_bitutils.h \
|
||||||
$(srcdir)/Include/internal/pycore_bytes_methods.h \
|
$(srcdir)/Include/internal/pycore_bytes_methods.h \
|
||||||
$(srcdir)/Include/internal/pycore_call.h \
|
$(srcdir)/Include/internal/pycore_call.h \
|
||||||
$(srcdir)/Include/internal/pycore_ceval.h \
|
$(srcdir)/Include/internal/pycore_ceval.h \
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
#include "pycore_byteswap.h" // _Py_bswap32()
|
#include "pycore_bitutils.h" // _Py_bswap32()
|
||||||
|
|
||||||
#include <ffi.h>
|
#include <ffi.h>
|
||||||
#ifdef MS_WIN32
|
#ifdef MS_WIN32
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
#define PY_SSIZE_T_CLEAN
|
#define PY_SSIZE_T_CLEAN
|
||||||
|
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
#include "pycore_byteswap.h" // _Py_bswap32()
|
#include "pycore_bitutils.h" // _Py_bswap32()
|
||||||
#include "pycore_initconfig.h" // _Py_GetConfigsAsDict()
|
#include "pycore_initconfig.h" // _Py_GetConfigsAsDict()
|
||||||
#include "pycore_hashtable.h" // _Py_hashtable_new()
|
#include "pycore_hashtable.h" // _Py_hashtable_new()
|
||||||
#include "pycore_gc.h" // PyGC_Head
|
#include "pycore_gc.h" // PyGC_Head
|
||||||
|
@ -63,6 +63,45 @@ test_bswap(PyObject *self, PyObject *Py_UNUSED(args))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int
|
||||||
|
check_popcount(uint32_t x, int expected)
|
||||||
|
{
|
||||||
|
// Use volatile to prevent the compiler from optimizing out the whole test
|
||||||
|
volatile uint32_t u = x;
|
||||||
|
int bits = _Py_popcount32(u);
|
||||||
|
if (bits != expected) {
|
||||||
|
PyErr_Format(PyExc_AssertionError,
|
||||||
|
"_Py_popcount32(%lu) returns %i, expected %i",
|
||||||
|
(unsigned long)x, bits, expected);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
test_popcount(PyObject *self, PyObject *Py_UNUSED(args))
|
||||||
|
{
|
||||||
|
#define CHECK(X, RESULT) \
|
||||||
|
do { \
|
||||||
|
if (check_popcount(X, RESULT) < 0) { \
|
||||||
|
return NULL; \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
CHECK(0, 0);
|
||||||
|
CHECK(1, 1);
|
||||||
|
CHECK(0x08080808, 4);
|
||||||
|
CHECK(0x10101010, 4);
|
||||||
|
CHECK(0x10204080, 4);
|
||||||
|
CHECK(0xDEADCAFE, 22);
|
||||||
|
CHECK(0xFFFFFFFF, 32);
|
||||||
|
Py_RETURN_NONE;
|
||||||
|
|
||||||
|
#undef CHECK
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#define TO_PTR(ch) ((void*)(uintptr_t)ch)
|
#define TO_PTR(ch) ((void*)(uintptr_t)ch)
|
||||||
#define FROM_PTR(ptr) ((uintptr_t)ptr)
|
#define FROM_PTR(ptr) ((uintptr_t)ptr)
|
||||||
#define VALUE(key) (1 + ((int)(key) - 'a'))
|
#define VALUE(key) (1 + ((int)(key) - 'a'))
|
||||||
|
@ -157,6 +196,7 @@ static PyMethodDef TestMethods[] = {
|
||||||
{"get_configs", get_configs, METH_NOARGS},
|
{"get_configs", get_configs, METH_NOARGS},
|
||||||
{"get_recursion_depth", get_recursion_depth, METH_NOARGS},
|
{"get_recursion_depth", get_recursion_depth, METH_NOARGS},
|
||||||
{"test_bswap", test_bswap, METH_NOARGS},
|
{"test_bswap", test_bswap, METH_NOARGS},
|
||||||
|
{"test_popcount", test_popcount, METH_NOARGS},
|
||||||
{"test_hashtable", test_hashtable, METH_NOARGS},
|
{"test_hashtable", test_hashtable, METH_NOARGS},
|
||||||
{NULL, NULL} /* sentinel */
|
{NULL, NULL} /* sentinel */
|
||||||
};
|
};
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
/* SHA objects */
|
/* SHA objects */
|
||||||
|
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
#include "pycore_byteswap.h" // _Py_bswap32()
|
#include "pycore_bitutils.h" // _Py_bswap32()
|
||||||
#include "structmember.h" // PyMemberDef
|
#include "structmember.h" // PyMemberDef
|
||||||
#include "hashlib.h"
|
#include "hashlib.h"
|
||||||
#include "pystrhex.h"
|
#include "pystrhex.h"
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
/* SHA objects */
|
/* SHA objects */
|
||||||
|
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
#include "pycore_byteswap.h" // _Py_bswap32()
|
#include "pycore_bitutils.h" // _Py_bswap32()
|
||||||
#include "structmember.h" // PyMemberDef
|
#include "structmember.h" // PyMemberDef
|
||||||
#include "hashlib.h"
|
#include "hashlib.h"
|
||||||
#include "pystrhex.h"
|
#include "pystrhex.h"
|
||||||
|
|
|
@ -3,8 +3,9 @@
|
||||||
/* XXX The functional organization of this file is terrible */
|
/* XXX The functional organization of this file is terrible */
|
||||||
|
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
#include "pycore_interp.h" // _PY_NSMALLPOSINTS
|
#include "pycore_bitutils.h" // _Py_popcount32()
|
||||||
#include "pycore_pystate.h" // _Py_IsMainInterpreter()
|
#include "pycore_interp.h" // _PY_NSMALLPOSINTS
|
||||||
|
#include "pycore_pystate.h" // _Py_IsMainInterpreter()
|
||||||
#include "longintrepr.h"
|
#include "longintrepr.h"
|
||||||
|
|
||||||
#include <float.h>
|
#include <float.h>
|
||||||
|
@ -5307,12 +5308,10 @@ int_bit_length_impl(PyObject *self)
|
||||||
static int
|
static int
|
||||||
popcount_digit(digit d)
|
popcount_digit(digit d)
|
||||||
{
|
{
|
||||||
/* 32bit SWAR popcount. */
|
// digit can be larger than uint32_t, but only PyLong_SHIFT bits
|
||||||
uint32_t u = d;
|
// of it will ever be used.
|
||||||
u -= (u >> 1) & 0x55555555U;
|
Py_BUILD_ASSERT(PyLong_SHIFT <= 32);
|
||||||
u = (u & 0x33333333U) + ((u >> 2) & 0x33333333U);
|
return _Py_popcount32((uint32_t)d);
|
||||||
u = (u + (u >> 4)) & 0x0f0f0f0fU;
|
|
||||||
return (uint32_t)(u * 0x01010101U) >> 24;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*[clinic input]
|
/*[clinic input]
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
# error "codecs.h is specific to Unicode"
|
# error "codecs.h is specific to Unicode"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "pycore_byteswap.h" // _Py_bswap32()
|
#include "pycore_bitutils.h" // _Py_bswap32()
|
||||||
|
|
||||||
/* Mask to quickly check whether a C 'long' contains a
|
/* Mask to quickly check whether a C 'long' contains a
|
||||||
non-ASCII, UTF8-encoded char. */
|
non-ASCII, UTF8-encoded char. */
|
||||||
|
|
|
@ -170,7 +170,7 @@
|
||||||
<ClInclude Include="..\Include\internal\pycore_accu.h" />
|
<ClInclude Include="..\Include\internal\pycore_accu.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
|
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h" />
|
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_byteswap.h" />
|
<ClInclude Include="..\Include\internal\pycore_bitutils.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_call.h" />
|
<ClInclude Include="..\Include\internal\pycore_call.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
|
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
|
||||||
<ClInclude Include="..\Include\internal\pycore_code.h" />
|
<ClInclude Include="..\Include\internal\pycore_code.h" />
|
||||||
|
|
|
@ -201,7 +201,7 @@
|
||||||
<ClInclude Include="..\Include\internal\pycore_atomic.h">
|
<ClInclude Include="..\Include\internal\pycore_atomic.h">
|
||||||
<Filter>Include</Filter>
|
<Filter>Include</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
<ClInclude Include="..\Include\internal\pycore_byteswap.h">
|
<ClInclude Include="..\Include\internal\pycore_bitutils.h">
|
||||||
<Filter>Include</Filter>
|
<Filter>Include</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h">
|
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h">
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
|
|
||||||
|
#include "pycore_bitutils.h" // _Py_popcount32
|
||||||
#include "pycore_hamt.h"
|
#include "pycore_hamt.h"
|
||||||
#include "pycore_object.h" // _PyObject_GC_TRACK()
|
#include "pycore_object.h" // _PyObject_GC_TRACK()
|
||||||
#include <stddef.h> // offsetof()
|
#include <stddef.h> // offsetof()
|
||||||
|
@ -433,30 +434,10 @@ hamt_bitpos(int32_t hash, uint32_t shift)
|
||||||
return (uint32_t)1 << hamt_mask(hash, shift);
|
return (uint32_t)1 << hamt_mask(hash, shift);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline uint32_t
|
|
||||||
hamt_bitcount(uint32_t i)
|
|
||||||
{
|
|
||||||
/* We could use native popcount instruction but that would
|
|
||||||
require to either add configure flags to enable SSE4.2
|
|
||||||
support or to detect it dynamically. Otherwise, we have
|
|
||||||
a risk of CPython not working properly on older hardware.
|
|
||||||
|
|
||||||
In practice, there's no observable difference in
|
|
||||||
performance between using a popcount instruction or the
|
|
||||||
following fallback code.
|
|
||||||
|
|
||||||
The algorithm is copied from:
|
|
||||||
https://graphics.stanford.edu/~seander/bithacks.html
|
|
||||||
*/
|
|
||||||
i = i - ((i >> 1) & 0x55555555);
|
|
||||||
i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
|
|
||||||
return (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline uint32_t
|
static inline uint32_t
|
||||||
hamt_bitindex(uint32_t bitmap, uint32_t bit)
|
hamt_bitindex(uint32_t bitmap, uint32_t bit)
|
||||||
{
|
{
|
||||||
return hamt_bitcount(bitmap & (bit - 1));
|
return (uint32_t)_Py_popcount32(bitmap & (bit - 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -820,7 +801,7 @@ hamt_node_bitmap_assoc(PyHamtNode_Bitmap *self,
|
||||||
else {
|
else {
|
||||||
/* There was no key before with the same (shift,hash). */
|
/* There was no key before with the same (shift,hash). */
|
||||||
|
|
||||||
uint32_t n = hamt_bitcount(self->b_bitmap);
|
uint32_t n = (uint32_t)_Py_popcount32(self->b_bitmap);
|
||||||
|
|
||||||
if (n >= 16) {
|
if (n >= 16) {
|
||||||
/* When we have a situation where we want to store more
|
/* When we have a situation where we want to store more
|
||||||
|
|
Loading…
Reference in New Issue