bpo-32677: Optimize str.isascii() (GH-5356)
This commit is contained in:
parent
ea8fc52e75
commit
bea57060c8
|
@ -916,6 +916,13 @@ class BaseTest:
|
||||||
self.checkequal(True, '\x00\x7f', 'isascii')
|
self.checkequal(True, '\x00\x7f', 'isascii')
|
||||||
self.checkequal(False, '\x80', 'isascii')
|
self.checkequal(False, '\x80', 'isascii')
|
||||||
self.checkequal(False, '\xe9', 'isascii')
|
self.checkequal(False, '\xe9', 'isascii')
|
||||||
|
# bytes.isascii() and bytearray.isascii() have optimizations which
|
||||||
|
# check 4 or 8 bytes at once. So check some alignments.
|
||||||
|
for p in range(8):
|
||||||
|
self.checkequal(True, ' '*p + '\x7f', 'isascii')
|
||||||
|
self.checkequal(False, ' '*p + '\x80', 'isascii')
|
||||||
|
self.checkequal(True, ' '*p + '\x7f' + ' '*8, 'isascii')
|
||||||
|
self.checkequal(False, ' '*p + '\x80' + ' '*8, 'isascii')
|
||||||
|
|
||||||
def test_isdigit(self):
|
def test_isdigit(self):
|
||||||
self.checkequal(False, '', 'isdigit')
|
self.checkequal(False, '', 'isdigit')
|
||||||
|
|
|
@ -98,19 +98,51 @@ PyDoc_STRVAR_shared(_Py_isascii__doc__,
|
||||||
Return True if B is empty or all characters in B are ASCII,\n\
|
Return True if B is empty or all characters in B are ASCII,\n\
|
||||||
False otherwise.");
|
False otherwise.");
|
||||||
|
|
||||||
|
// Optimization is copied from ascii_decode in unicodeobject.c
|
||||||
|
/* Mask to quickly check whether a C 'long' contains a
|
||||||
|
non-ASCII, UTF8-encoded char. */
|
||||||
|
#if (SIZEOF_LONG == 8)
|
||||||
|
# define ASCII_CHAR_MASK 0x8080808080808080UL
|
||||||
|
#elif (SIZEOF_LONG == 4)
|
||||||
|
# define ASCII_CHAR_MASK 0x80808080UL
|
||||||
|
#else
|
||||||
|
# error C 'long' size should be either 4 or 8!
|
||||||
|
#endif
|
||||||
|
|
||||||
PyObject*
|
PyObject*
|
||||||
_Py_bytes_isascii(const char *cptr, Py_ssize_t len)
|
_Py_bytes_isascii(const char *cptr, Py_ssize_t len)
|
||||||
{
|
{
|
||||||
const unsigned char *p = (unsigned char *) cptr;
|
const char *p = cptr;
|
||||||
const unsigned char *e = p + len;
|
const char *end = p + len;
|
||||||
for (; p < e; p++) {
|
const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
|
||||||
if (*p >= 128) {
|
|
||||||
|
while (p < end) {
|
||||||
|
/* Fast path; see STRINGLIB(utf8_decode) in stringlib/codecs.h
|
||||||
|
for an explanation. */
|
||||||
|
if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
|
||||||
|
/* Help register allocation */
|
||||||
|
const char *_p = p;
|
||||||
|
while (_p < aligned_end) {
|
||||||
|
unsigned long value = *(unsigned long *) _p;
|
||||||
|
if (value & ASCII_CHAR_MASK) {
|
||||||
|
Py_RETURN_FALSE;
|
||||||
|
}
|
||||||
|
_p += SIZEOF_LONG;
|
||||||
|
}
|
||||||
|
p = _p;
|
||||||
|
if (_p == end)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if ((unsigned char)*p & 0x80) {
|
||||||
Py_RETURN_FALSE;
|
Py_RETURN_FALSE;
|
||||||
}
|
}
|
||||||
|
p++;
|
||||||
}
|
}
|
||||||
Py_RETURN_TRUE;
|
Py_RETURN_TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#undef ASCII_CHAR_MASK
|
||||||
|
|
||||||
|
|
||||||
PyDoc_STRVAR_shared(_Py_isdigit__doc__,
|
PyDoc_STRVAR_shared(_Py_isdigit__doc__,
|
||||||
"B.isdigit() -> bool\n\
|
"B.isdigit() -> bool\n\
|
||||||
|
|
Loading…
Reference in New Issue