From 794e7d1ab2d7afe70fe0dd87ca8174ac860413e4 Mon Sep 17 00:00:00 2001
From: Niklas Fiekas <niklas.fiekas@backscattering.de>
Date: Mon, 15 Jun 2020 14:33:48 +0200
Subject: [PATCH] bpo-29782: Consolidate _Py_Bit_Length() (GH-20739)

In GH-2866, _Py_Bit_Length() was added to pymath.h for lack of a better
location. GH-20518 added a more appropriate header file for bit utilities. It
also shows how to properly use intrinsics. This allows reconsidering bpo-29782.

* Move the function to the new header.
* Changed return type to match __builtin_clzl() and reviewed usage.
* Use intrinsics where available.
* Pick a fallback implementation suitable for inlining.
---
 Include/internal/pycore_bitutils.h | 47 +++++++++++++++++++++++++++---
 Include/pymath.h                   |  8 -----
 Modules/_testinternalcapi.c        | 40 +++++++++++++++++++++++++
 Modules/mathmodule.c               |  1 +
 Objects/longobject.c               | 25 ++++++++++------
 Python/pymath.c                    | 15 ----------
 6 files changed, 100 insertions(+), 36 deletions(-)

diff --git a/Include/internal/pycore_bitutils.h b/Include/internal/pycore_bitutils.h
index 36ffe23b9ff..0bd3270fe82 100644
--- a/Include/internal/pycore_bitutils.h
+++ b/Include/internal/pycore_bitutils.h
@@ -7,8 +7,8 @@
    - _Py_bswap64(uint64_t)
 */
 
-#ifndef Py_INTERNAL_BSWAP_H
-#define Py_INTERNAL_BSWAP_H
+#ifndef Py_INTERNAL_BITUTILS_H
+#define Py_INTERNAL_BITUTILS_H
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -131,8 +131,47 @@ _Py_popcount32(uint32_t x)
 }
 
 
+// Return the index of the most significant 1 bit in 'x'. This is the smallest
+// integer k such that x < 2**k. Equivalent to floor(log2(x)) + 1 for x != 0.
+static inline int
+_Py_bit_length(unsigned long x)
+{
+#if (defined(__clang__) || defined(__GNUC__))
+    if (x != 0) {
+        // __builtin_clzl() is available since GCC 3.4.
+        // Undefined behavior for x == 0.
+        return (int)sizeof(unsigned long) * 8 - __builtin_clzl(x);
+    }
+    else {
+        return 0;
+    }
+#elif defined(_MSC_VER)
+    // _BitScanReverse() is documented to search 32 bits.
+    Py_BUILD_ASSERT(sizeof(unsigned long) <= 4);
+    unsigned long msb;
+    if (_BitScanReverse(&msb, x)) {
+        return (int)msb + 1;
+    }
+    else {
+        return 0;
+    }
+#else
+    const int BIT_LENGTH_TABLE[32] = {
+        0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+    };
+    int msb = 0;
+    while (x >= 32) {
+        msb += 6;
+        x >>= 6;
+    }
+    msb += BIT_LENGTH_TABLE[x];
+    return msb;
+#endif
+}
+
+
 #ifdef __cplusplus
 }
 #endif
-#endif /* !Py_INTERNAL_BSWAP_H */
-
+#endif /* !Py_INTERNAL_BITUTILS_H */
diff --git a/Include/pymath.h b/Include/pymath.h
index 63ca972784e..f869724334a 100644
--- a/Include/pymath.h
+++ b/Include/pymath.h
@@ -227,12 +227,4 @@ PyAPI_FUNC(void) _Py_set_387controlword(unsigned short);
  * behavior. */
 #define _Py_InIntegralTypeRange(type, v) (_Py_IntegralTypeMin(type) <= v && v <= _Py_IntegralTypeMax(type))
 
-/* Return the smallest integer k such that n < 2**k, or 0 if n == 0.
- * Equivalent to floor(log2(x))+1.  Also equivalent to: bitwidth_of_type -
- * count_leading_zero_bits(x)
- */
-#ifndef Py_LIMITED_API
-PyAPI_FUNC(unsigned int) _Py_bit_length(unsigned long d);
-#endif
-
 #endif /* Py_PYMATH_H */
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index 6d5af5917f1..7970e2f4f44 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -102,6 +102,45 @@ test_popcount(PyObject *self, PyObject *Py_UNUSED(args))
 }
 
 
+static int
+check_bit_length(unsigned long x, int expected)
+{
+    // Use volatile to prevent the compiler to optimize out the whole test
+    volatile unsigned long u = x;
+    int len = _Py_bit_length(u);
+    if (len != expected) {
+        PyErr_Format(PyExc_AssertionError,
+                     "_Py_bit_length(%lu) returns %i, expected %i",
+                     x, len, expected);
+        return -1;
+    }
+    return 0;
+}
+
+
+static PyObject*
+test_bit_length(PyObject *self, PyObject *Py_UNUSED(args))
+{
+#define CHECK(X, RESULT) \
+    do { \
+        if (check_bit_length(X, RESULT) < 0) { \
+            return NULL; \
+        } \
+    } while (0)
+
+    CHECK(0, 0);
+    CHECK(1, 1);
+    CHECK(0x1000, 13);
+    CHECK(0x1234, 13);
+    CHECK(0x54321, 19);
+    CHECK(0x7FFFFFFF, 31);
+    CHECK(0xFFFFFFFF, 32);
+    Py_RETURN_NONE;
+
+#undef CHECK
+}
+
+
 #define TO_PTR(ch) ((void*)(uintptr_t)ch)
 #define FROM_PTR(ptr) ((uintptr_t)ptr)
 #define VALUE(key) (1 + ((int)(key) - 'a'))
@@ -197,6 +236,7 @@ static PyMethodDef TestMethods[] = {
     {"get_recursion_depth", get_recursion_depth, METH_NOARGS},
     {"test_bswap", test_bswap, METH_NOARGS},
     {"test_popcount", test_popcount, METH_NOARGS},
+    {"test_bit_length", test_bit_length, METH_NOARGS},
     {"test_hashtable", test_hashtable, METH_NOARGS},
     {NULL, NULL} /* sentinel */
 };
diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index cb05ce7c509..4450ce18941 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -53,6 +53,7 @@ raised for division by zero and mod by zero.
  */
 
 #include "Python.h"
+#include "pycore_bitutils.h"      // _Py_bit_length()
 #include "pycore_dtoa.h"
 #include "_math.h"
 
diff --git a/Objects/longobject.c b/Objects/longobject.c
index dead3e30694..d92a9c56a72 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -695,6 +695,13 @@ _PyLong_Sign(PyObject *vv)
     return Py_SIZE(v) == 0 ? 0 : (Py_SIZE(v) < 0 ? -1 : 1);
 }
 
+static int
+bit_length_digit(digit x)
+{
+    Py_BUILD_ASSERT(PyLong_SHIFT <= sizeof(unsigned long) * 8);
+    return _Py_bit_length((unsigned long)x);
+}
+
 size_t
 _PyLong_NumBits(PyObject *vv)
 {
@@ -712,7 +719,7 @@ _PyLong_NumBits(PyObject *vv)
         if ((size_t)(ndigits - 1) > SIZE_MAX / (size_t)PyLong_SHIFT)
             goto Overflow;
         result = (size_t)(ndigits - 1) * (size_t)PyLong_SHIFT;
-        msd_bits = _Py_bit_length(msd);
+        msd_bits = bit_length_digit(msd);
         if (SIZE_MAX - msd_bits < result)
             goto Overflow;
         result += msd_bits;
@@ -1822,7 +1829,7 @@ long_format_binary(PyObject *aa, int base, int alternate,
             return -1;
         }
         size_a_in_bits = (size_a - 1) * PyLong_SHIFT +
-                         _Py_bit_length(a->ob_digit[size_a - 1]);
+                         bit_length_digit(a->ob_digit[size_a - 1]);
         /* Allow 1 character for a '-' sign. */
         sz = negative + (size_a_in_bits + (bits - 1)) / bits;
     }
@@ -2642,7 +2649,7 @@ x_divrem(PyLongObject *v1, PyLongObject *w1, PyLongObject **prem)
 
     /* normalize: shift w1 left so that its top digit is >= PyLong_BASE/2.
        shift v1 left by the same amount.  Results go into w and v. */
-    d = PyLong_SHIFT - _Py_bit_length(w1->ob_digit[size_w-1]);
+    d = PyLong_SHIFT - bit_length_digit(w1->ob_digit[size_w-1]);
     carry = v_lshift(w->ob_digit, w1->ob_digit, size_w, d);
     assert(carry == 0);
     carry = v_lshift(v->ob_digit, v1->ob_digit, size_v, d);
@@ -2764,7 +2771,7 @@ _PyLong_Frexp(PyLongObject *a, Py_ssize_t *e)
         *e = 0;
         return 0.0;
     }
-    a_bits = _Py_bit_length(a->ob_digit[a_size-1]);
+    a_bits = bit_length_digit(a->ob_digit[a_size-1]);
     /* The following is an overflow-free version of the check
        "if ((a_size - 1) * PyLong_SHIFT + a_bits > PY_SSIZE_T_MAX) ..." */
     if (a_size >= (PY_SSIZE_T_MAX - 1) / PyLong_SHIFT + 1 &&
@@ -3857,8 +3864,8 @@ long_true_divide(PyObject *v, PyObject *w)
         /* Extreme underflow */
         goto underflow_or_zero;
     /* Next line is now safe from overflowing a Py_ssize_t */
-    diff = diff * PyLong_SHIFT + _Py_bit_length(a->ob_digit[a_size - 1]) -
-        _Py_bit_length(b->ob_digit[b_size - 1]);
+    diff = diff * PyLong_SHIFT + bit_length_digit(a->ob_digit[a_size - 1]) -
+        bit_length_digit(b->ob_digit[b_size - 1]);
     /* Now diff = a_bits - b_bits. */
     if (diff > DBL_MAX_EXP)
         goto overflow;
@@ -3934,7 +3941,7 @@ long_true_divide(PyObject *v, PyObject *w)
     }
     x_size = Py_ABS(Py_SIZE(x));
     assert(x_size > 0); /* result of division is never zero */
-    x_bits = (x_size-1)*PyLong_SHIFT+_Py_bit_length(x->ob_digit[x_size-1]);
+    x_bits = (x_size-1)*PyLong_SHIFT+bit_length_digit(x->ob_digit[x_size-1]);
 
     /* The number of extra bits that have to be rounded away. */
     extra_bits = Py_MAX(x_bits, DBL_MIN_EXP - shift) - DBL_MANT_DIG;
@@ -4748,7 +4755,7 @@ _PyLong_GCD(PyObject *aarg, PyObject *barg)
     alloc_b = Py_SIZE(b);
     /* reduce until a fits into 2 digits */
     while ((size_a = Py_SIZE(a)) > 2) {
-        nbits = _Py_bit_length(a->ob_digit[size_a-1]);
+        nbits = bit_length_digit(a->ob_digit[size_a-1]);
         /* extract top 2*PyLong_SHIFT bits of a into x, along with
            corresponding bits of b into y */
         size_b = Py_SIZE(b);
@@ -5269,7 +5276,7 @@ int_bit_length_impl(PyObject *self)
         return PyLong_FromLong(0);
 
     msd = ((PyLongObject *)self)->ob_digit[ndigits-1];
-    msd_bits = _Py_bit_length(msd);
+    msd_bits = bit_length_digit(msd);
 
     if (ndigits <= PY_SSIZE_T_MAX/PyLong_SHIFT)
         return PyLong_FromSsize_t((ndigits-1)*PyLong_SHIFT + msd_bits);
diff --git a/Python/pymath.c b/Python/pymath.c
index a08a0e79615..24b804223ee 100644
--- a/Python/pymath.c
+++ b/Python/pymath.c
@@ -79,18 +79,3 @@ round(double x)
     return copysign(y, x);
 }
 #endif /* HAVE_ROUND */
-
-static const unsigned int BitLengthTable[32] = {
-    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
-};
-
-unsigned int _Py_bit_length(unsigned long d) {
-   unsigned int d_bits = 0;
-   while (d >= 32) {
-       d_bits += 6;
-       d >>= 6;
-   }
-   d_bits += BitLengthTable[d];
-   return d_bits;
-}