mirror of https://github.com/python/cpython
gh-91576: Speed up iteration of strings (#91574)
This commit is contained in:
parent
a29f858124
commit
8c54c3dacc
|
@ -20,6 +20,7 @@ extern void _PyUnicode_Fini(PyInterpreterState *);
|
|||
extern void _PyUnicode_FiniTypes(PyInterpreterState *);
|
||||
extern void _PyStaticUnicode_Dealloc(PyObject *);
|
||||
|
||||
extern PyTypeObject _PyUnicodeASCIIIter_Type;
|
||||
|
||||
/* other API */
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ import _string
|
|||
import codecs
|
||||
import itertools
|
||||
import operator
|
||||
import pickle
|
||||
import struct
|
||||
import sys
|
||||
import textwrap
|
||||
|
@ -185,6 +186,36 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
self.assertEqual(next(it), "\u3333")
|
||||
self.assertRaises(StopIteration, next, it)
|
||||
|
||||
def test_iterators_invocation(self):
|
||||
cases = [type(iter('abc')), type(iter('🚀'))]
|
||||
for cls in cases:
|
||||
with self.subTest(cls=cls):
|
||||
self.assertRaises(TypeError, cls)
|
||||
|
||||
def test_iteration(self):
|
||||
cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
|
||||
for case in cases:
|
||||
with self.subTest(string=case):
|
||||
self.assertEqual(case, "".join(iter(case)))
|
||||
|
||||
def test_exhausted_iterator(self):
|
||||
cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
|
||||
for case in cases:
|
||||
with self.subTest(case=case):
|
||||
iterator = iter(case)
|
||||
tuple(iterator)
|
||||
self.assertRaises(StopIteration, next, iterator)
|
||||
|
||||
def test_pickle_iterator(self):
|
||||
cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
|
||||
for case in cases:
|
||||
with self.subTest(case=case):
|
||||
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
|
||||
it = iter(case)
|
||||
with self.subTest(proto=proto):
|
||||
pickled = "".join(pickle.loads(pickle.dumps(it, proto)))
|
||||
self.assertEqual(case, pickled)
|
||||
|
||||
def test_count(self):
|
||||
string_tests.CommonTest.test_count(self)
|
||||
# check mixed argument types
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Speed up iteration of ascii strings by 50%. Patch by Kumar Aditya.
|
|
@ -1936,6 +1936,7 @@ static PyTypeObject* static_types[] = {
|
|||
&_PyNamespace_Type,
|
||||
&_PyNone_Type,
|
||||
&_PyNotImplemented_Type,
|
||||
&_PyUnicodeASCIIIter_Type,
|
||||
&_PyUnion_Type,
|
||||
&_PyWeakref_CallableProxyType,
|
||||
&_PyWeakref_ProxyType,
|
||||
|
|
|
@ -15697,7 +15697,7 @@ unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
|
|||
static PyObject *
|
||||
unicodeiter_next(unicodeiterobject *it)
|
||||
{
|
||||
PyObject *seq, *item;
|
||||
PyObject *seq;
|
||||
|
||||
assert(it != NULL);
|
||||
seq = it->it_seq;
|
||||
|
@ -15709,10 +15709,8 @@ unicodeiter_next(unicodeiterobject *it)
|
|||
int kind = PyUnicode_KIND(seq);
|
||||
const void *data = PyUnicode_DATA(seq);
|
||||
Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
|
||||
item = PyUnicode_FromOrdinal(chr);
|
||||
if (item != NULL)
|
||||
++it->it_index;
|
||||
return item;
|
||||
it->it_index++;
|
||||
return unicode_char(chr);
|
||||
}
|
||||
|
||||
it->it_seq = NULL;
|
||||
|
@ -15720,6 +15718,29 @@ unicodeiter_next(unicodeiterobject *it)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
unicode_ascii_iter_next(unicodeiterobject *it)
|
||||
{
|
||||
assert(it != NULL);
|
||||
PyObject *seq = it->it_seq;
|
||||
if (seq == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
assert(_PyUnicode_CHECK(seq));
|
||||
assert(PyUnicode_IS_COMPACT_ASCII(seq));
|
||||
if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
|
||||
const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
|
||||
Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
|
||||
data, it->it_index);
|
||||
it->it_index++;
|
||||
PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
|
||||
return Py_NewRef(item);
|
||||
}
|
||||
it->it_seq = NULL;
|
||||
Py_DECREF(seq);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
|
||||
{
|
||||
|
@ -15808,6 +15829,19 @@ PyTypeObject PyUnicodeIter_Type = {
|
|||
0,
|
||||
};
|
||||
|
||||
PyTypeObject _PyUnicodeASCIIIter_Type = {
|
||||
PyVarObject_HEAD_INIT(&PyType_Type, 0)
|
||||
.tp_name = "str_ascii_iterator",
|
||||
.tp_basicsize = sizeof(unicodeiterobject),
|
||||
.tp_dealloc = (destructor)unicodeiter_dealloc,
|
||||
.tp_getattro = PyObject_GenericGetAttr,
|
||||
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
|
||||
.tp_traverse = (traverseproc)unicodeiter_traverse,
|
||||
.tp_iter = PyObject_SelfIter,
|
||||
.tp_iternext = (iternextfunc)unicode_ascii_iter_next,
|
||||
.tp_methods = unicodeiter_methods,
|
||||
};
|
||||
|
||||
static PyObject *
|
||||
unicode_iter(PyObject *seq)
|
||||
{
|
||||
|
@ -15819,7 +15853,12 @@ unicode_iter(PyObject *seq)
|
|||
}
|
||||
if (PyUnicode_READY(seq) == -1)
|
||||
return NULL;
|
||||
it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
|
||||
if (PyUnicode_IS_COMPACT_ASCII(seq)) {
|
||||
it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
|
||||
}
|
||||
else {
|
||||
it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
|
||||
}
|
||||
if (it == NULL)
|
||||
return NULL;
|
||||
it->it_index = 0;
|
||||
|
|
Loading…
Reference in New Issue