gh-120196: Reuse find_max_char() for bytes objects (#120497)

2024-06-17 12:21:58 +02:00 · 2024-06-17 12:21:58 +02:00 · 945a89b48f
parent 21866c8ed2
commit 945a89b48f
2 changed files with 25 additions and 55 deletions
--- a/Objects/bytes_methods.c
+++ b/Objects/bytes_methods.c
@ -92,57 +92,6 @@ _Py_bytes_isalnum(const char *cptr, Py_ssize_t len)
 }
 /* Docstring text for the isascii() method, declared through
    PyDoc_STRVAR_shared under the name _Py_isascii__doc__. */
 PyDoc_STRVAR_shared(_Py_isascii__doc__,
 "B.isascii() -> bool\n\
 \n\
 Return True if B is empty or all characters in B are ASCII,\n\
 False otherwise.");
 // Optimization is copied from ascii_decode in unicodeobject.c
 /* Mask to quickly check whether a C 'size_t' contains a byte with
   its high bit (0x80) set, i.e. a non-ASCII byte.  One 0x80 per byte
   of the word, so only 4- and 8-byte size_t are supported. */
 #if (SIZEOF_SIZE_T == 8)
 # define ASCII_CHAR_MASK 0x8080808080808080ULL
 #elif (SIZEOF_SIZE_T == 4)
 # define ASCII_CHAR_MASK 0x80808080U
 #else
 # error C 'size_t' size should be either 4 or 8!
 #endif
 /* Check whether the 'len' bytes starting at 'cptr' are all ASCII.

    Returns through Py_RETURN_FALSE as soon as a byte with the high bit
    set is seen, and through Py_RETURN_TRUE otherwise.  With len == 0
    the loop body never runs, so the result is True. */
 PyObject*
 _Py_bytes_isascii(const char *cptr, Py_ssize_t len)
 {
    const char *p = cptr;
    const char *end = p + len;
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        /* The word-at-a-time scan is only entered when p is suitably
           aligned for a size_t read. */
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help register allocation */
            const char *_p = p;
            /* Test SIZEOF_SIZE_T bytes per iteration while a whole word
               still fits before 'end'. */
            while (_p + SIZEOF_SIZE_T <= end) {
                size_t value = *(const size_t *) _p;
                if (value & ASCII_CHAR_MASK) {
                    Py_RETURN_FALSE;
                }
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
            /* The word loop consumed everything: nothing left to test. */
            if (_p == end)
                break;
        }
        /* Byte-at-a-time path: used for an unaligned position and for
           a tail shorter than one word. */
        if ((unsigned char)*p & 0x80) {
            Py_RETURN_FALSE;
        }
        p++;
    }
    Py_RETURN_TRUE;
 }
 /* The mask is private to the function above. */
 #undef ASCII_CHAR_MASK
 PyDoc_STRVAR_shared(_Py_isdigit__doc__,
 "B.isdigit() -> bool\n\
 \n\
@ -438,6 +387,7 @@ _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to)
 #include "stringlib/fastsearch.h"
 #include "stringlib/count.h"
 #include "stringlib/find.h"
 #include "stringlib/find_max_char.h"
 /*
 Wraps stringlib_parse_args_finds() and additionally checks the first
@ -765,3 +715,21 @@ _Py_bytes_endswith(const char *str, Py_ssize_t len, PyObject *subobj,
 {
    return _Py_bytes_tailmatch(str, len, "endswith", subobj, start, end, +1);
 }
 /* Docstring text for the isascii() method, declared through
    PyDoc_STRVAR_shared under the name _Py_isascii__doc__. */
 PyDoc_STRVAR_shared(_Py_isascii__doc__,
 "B.isascii() -> bool\n\
 \n\
 Return True if B is empty or all characters in B are ASCII,\n\
 False otherwise.");
 /* Check whether the 'len' bytes starting at 'cptr' are all ASCII.

    The scan is delegated to stringlib_find_max_char(); a result of
    127 or less yields True, anything larger yields False. */
 PyObject*
 _Py_bytes_isascii(const char *cptr, Py_ssize_t len)
 {
    const char *stop = cptr + len;
    Py_ssize_t highest = stringlib_find_max_char(cptr, stop);
    if (highest <= 127) {
        Py_RETURN_TRUE;
    }
    Py_RETURN_FALSE;
 }
--- a/Objects/stringlib/find_max_char.h
+++ b/Objects/stringlib/find_max_char.h
@ -1,6 +1,7 @@
 /* Finding the optimal width of unicode characters in a buffer */
-#if !STRINGLIB_IS_UNICODE
+/* The one-byte variant of find_max_char works for bytes objects as well. */
 #if !STRINGLIB_IS_UNICODE && STRINGLIB_SIZEOF_CHAR > 1
 # error "find_max_char.h is specific to Unicode"
 #endif
@ -20,19 +21,20 @@ Py_LOCAL_INLINE(Py_UCS4)
 STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
 {
    const unsigned char *p = (const unsigned char *) begin;
    const unsigned char *_end = (const unsigned char *)end;
-    while (p < end) {
+    while (p < _end) {
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help register allocation */
            const unsigned char *_p = p;
-            while (_p + SIZEOF_SIZE_T <= end) {
+            while (_p + SIZEOF_SIZE_T <= _end) {
                size_t value = *(const size_t *) _p;
                if (value & UCS1_ASCII_CHAR_MASK)
                    return 255;
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
-            if (p == end)
+            if (p == _end)
                break;
        }
        if (*p++ & 0x80)