From 75a6fadf369315b27e12f670e6295cf2c2cf7d7e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 24 Oct 2022 01:57:30 +0300 Subject: [PATCH] gh-91524: Speed up the regular expression substitution (#91525) Functions re.sub() and re.subn() and corresponding re.Pattern methods are now 2-3 times faster for replacement strings containing group references. Closes #91524 Primarily authored by serhiy-storchaka Serhiy Storchaka Minor-cleanups-by: Gregory P. Smith [Google] --- Doc/whatsnew/3.12.rst | 5 + Lib/re/__init__.py | 22 +- Lib/re/_constants.py | 2 +- Lib/re/_parser.py | 45 +-- ...2-04-14-08-37-16.gh-issue-91524.g8PiIu.rst | 3 + Modules/_sre/clinic/sre.c.h | 41 ++- Modules/_sre/sre.c | 318 +++++++++++++++--- Modules/_sre/sre.h | 11 + Modules/_sre/sre_constants.h | 2 +- 9 files changed, 358 insertions(+), 91 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 3e0b106c4a0..8f8a9946151 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -205,6 +205,11 @@ Optimizations process, which improves performance by 1-5%. (Contributed by Kevin Modzelewski in :gh:`90536`.) +* Speed up the regular expression substitution (functions :func:`re.sub` and + :func:`re.subn` and corresponding :class:`re.Pattern` methods) for + replacement strings containing group references by 2--3 times. + (Contributed by Serhiy Storchaka in :gh:`91524`.) + CPython bytecode changes ======================== diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index 8d6a4ef3880..4515650a721 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -124,6 +124,7 @@ This module also defines an exception 'error'. import enum from . import _compiler, _parser import functools +import _sre # public symbols @@ -230,7 +231,7 @@ def purge(): "Clear the regular expression caches" _cache.clear() _cache2.clear() - _compile_repl.cache_clear() + _compile_template.cache_clear() def template(pattern, flags=0): "Compile a template pattern, returning a Pattern object, deprecated" @@ -328,24 +329,9 @@ def _compile(pattern, flags): return p @functools.lru_cache(_MAXCACHE) -def _compile_repl(repl, pattern): +def _compile_template(pattern, repl): # internal: compile replacement pattern - return _parser.parse_template(repl, pattern) - -def _expand(pattern, match, template): - # internal: Match.expand implementation hook - template = _parser.parse_template(template, pattern) - return _parser.expand_template(template, match) - -def _subx(pattern, template): - # internal: Pattern.sub/subn implementation helper - template = _compile_repl(template, pattern) - if not template[0] and len(template[1]) == 1: - # literal replacement - return template[1][0] - def filter(match, template=template): - return _parser.expand_template(template, match) - return filter + return _sre.template(pattern, _parser.parse_template(repl, pattern)) # register myself for pickling diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py index 10ee14bfab4..d8718d36075 100644 --- a/Lib/re/_constants.py +++ b/Lib/re/_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20220615 +MAGIC = 20221023 from _sre import MAXREPEAT, MAXGROUPS diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 0d9cf632ea7..5709acb6267 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -984,24 +984,28 @@ def parse(str, flags=0, state=None): return p -def parse_template(source, state): +def parse_template(source, pattern): # parse 're' replacement string into list of literals and # group references s = Tokenizer(source) sget = s.get - groups = [] - literals = [] + result = [] literal = [] lappend = literal.append + def addliteral(): + if s.istext: + result.append(''.join(literal)) + else: + # The tokenizer implicitly decodes bytes objects as latin-1, we must + # therefore re-encode the final representation. + result.append(''.join(literal).encode('latin-1')) + del literal[:] def addgroup(index, pos): - if index > state.groups: + if index > pattern.groups: raise s.error("invalid group reference %d" % index, pos) - if literal: - literals.append(''.join(literal)) - del literal[:] - groups.append((len(literals), index)) - literals.append(None) - groupindex = state.groupindex + addliteral() + result.append(index) + groupindex = pattern.groupindex while True: this = sget() if this is None: @@ -1063,22 +1067,5 @@ def parse_template(source, state): lappend(this) else: lappend(this) - if literal: - literals.append(''.join(literal)) - if not isinstance(source, str): - # The tokenizer implicitly decodes bytes objects as latin-1, we must - # therefore re-encode the final representation. - literals = [None if s is None else s.encode('latin-1') for s in literals] - return groups, literals - -def expand_template(template, match): - g = match.group - empty = match.string[:0] - groups, literals = template - literals = literals[:] - try: - for index, group in groups: - literals[index] = g(group) or empty - except IndexError: - raise error("invalid group reference %d" % index) from None - return empty.join(literals) + addliteral() + return result diff --git a/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst b/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst new file mode 100644 index 00000000000..b3f01755eaa --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-14-08-37-16.gh-issue-91524.g8PiIu.rst @@ -0,0 +1,3 @@ +Speed up the regular expression substitution (functions :func:`re.sub` and +:func:`re.subn` and corresponding :class:`re.Pattern` methods) for +replacement strings containing group references by 2--3 times. diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index 711e16a1190..da641081ce9 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -1068,6 +1068,45 @@ exit: return return_value; } +PyDoc_STRVAR(_sre_template__doc__, +"template($module, pattern, template, /)\n" +"--\n" +"\n" +"\n" +"\n" +" template\n" +" A list containing interleaved literal strings (str or bytes) and group\n" +" indices (int), as returned by re._parser.parse_template():\n" +" [literal1, group1, ..., literalN, groupN]"); + +#define _SRE_TEMPLATE_METHODDEF \ + {"template", _PyCFunction_CAST(_sre_template), METH_FASTCALL, _sre_template__doc__}, + +static PyObject * +_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template); + +static PyObject * +_sre_template(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + PyObject *pattern; + PyObject *template; + + if (!_PyArg_CheckPositional("template", nargs, 2, 2)) { + goto exit; + } + pattern = args[0]; + if (!PyList_Check(args[1])) { + _PyArg_BadArgument("template", "argument 2", "list", args[1]); + goto exit; + } + template = args[1]; + return_value = _sre_template_impl(module, pattern, template); + +exit: + return return_value; +} + PyDoc_STRVAR(_sre_SRE_Match_expand__doc__, "expand($self, /, template)\n" "--\n" @@ -1421,4 +1460,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const } return _sre_SRE_Scanner_search_impl(self, cls); } -/*[clinic end generated code: output=14ea86f85c130a7b input=a9049054013a1b77]*/ +/*[clinic end generated code: output=e3ba72156dd71572 input=a9049054013a1b77]*/ diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index bcb30848d9a..aae02652664 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -51,13 +51,6 @@ static const char copyright[] = #include -/* name of this module, minus the leading underscore */ -#if !defined(SRE_MODULE) -#define SRE_MODULE "sre" -#endif - -#define SRE_PY_MODULE "re" - /* defining this one enables tracing */ #undef VERBOSE @@ -254,6 +247,8 @@ typedef struct { PyTypeObject *Pattern_Type; PyTypeObject *Match_Type; PyTypeObject *Scanner_Type; + PyTypeObject *Template_Type; + PyObject *compile_template; // reference to re._compile_template } _sremodulestate; static _sremodulestate * @@ -757,23 +752,6 @@ _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls, return match; } -static PyObject* -call(const char* module, const char* function, PyObject* args) -{ - PyObject* func; - PyObject* result; - - if (!args) - return NULL; - func = _PyImport_GetModuleAttrString(module, function); - if (!func) - return NULL; - result = PyObject_CallObject(func, args); - Py_DECREF(func); - Py_DECREF(args); - return result; -} - /*[clinic input] _sre.SRE_Pattern.findall @@ -1036,6 +1014,57 @@ error: } +static PyObject * +compile_template(_sremodulestate *module_state, + PatternObject *pattern, PyObject *template) +{ + /* delegate to Python code */ + PyObject *func = module_state->compile_template; + if (func == NULL) { + func = _PyImport_GetModuleAttrString("re", "_compile_template"); + if (func == NULL) { + return NULL; + } + Py_XSETREF(module_state->compile_template, func); + } + + PyObject *args[] = {(PyObject *)pattern, template}; + PyObject *result = PyObject_Vectorcall(func, args, 2, NULL); + + if (result == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) { + /* If the replacement string is unhashable (e.g. bytearray), + * convert it to the basic type (str or bytes) and repeat. */ + if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) { + PyErr_Clear(); + template = _PyUnicode_Copy(template); + } + else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) { + PyErr_Clear(); + template = PyBytes_FromObject(template); + } + else { + return NULL; + } + if (template == NULL) { + return NULL; + } + args[1] = template; + result = PyObject_Vectorcall(func, args, 2, NULL); + Py_DECREF(template); + } + + if (result != NULL && Py_TYPE(result) != module_state->Template_Type) { + PyErr_Format(PyExc_RuntimeError, + "the result of compiling a replacement string is %.200s", + Py_TYPE(result)->tp_name); + Py_DECREF(result); + return NULL; + } + return result; +} + +static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */ + static PyObject* pattern_subx(_sremodulestate* module_state, PatternObject* self, @@ -1055,14 +1084,14 @@ pattern_subx(_sremodulestate* module_state, Py_ssize_t n; Py_ssize_t i, b, e; int isbytes, charsize; - int filter_is_callable; + enum {LITERAL, TEMPLATE, CALLABLE} filter_type; Py_buffer view; if (PyCallable_Check(ptemplate)) { /* sub/subn takes either a function or a template */ filter = ptemplate; Py_INCREF(filter); - filter_is_callable = 1; + filter_type = CALLABLE; } else { /* if not callable, check if it's a literal string */ int literal; @@ -1082,16 +1111,22 @@ pattern_subx(_sremodulestate* module_state, if (literal) { filter = ptemplate; Py_INCREF(filter); - filter_is_callable = 0; + filter_type = LITERAL; } else { /* not a literal; hand it over to the template compiler */ - filter = call( - SRE_PY_MODULE, "_subx", - PyTuple_Pack(2, self, ptemplate) - ); + filter = compile_template(module_state, self, ptemplate); if (!filter) return NULL; - filter_is_callable = PyCallable_Check(filter); + + assert(Py_TYPE(filter) == module_state->Template_Type); + if (Py_SIZE(filter) == 0) { + Py_INCREF(((TemplateObject *)filter)->literal); + Py_SETREF(filter, ((TemplateObject *)filter)->literal); + filter_type = LITERAL; + } + else { + filter_type = TEMPLATE; + } } } @@ -1142,12 +1177,19 @@ pattern_subx(_sremodulestate* module_state, } - if (filter_is_callable) { + if (filter_type != LITERAL) { /* pass match object through filter */ match = pattern_new_match(module_state, self, &state, 1); if (!match) goto error; - item = PyObject_CallOneArg(filter, match); + if (filter_type == TEMPLATE) { + item = expand_template((TemplateObject *)filter, + (MatchObject *)match); + } + else { + assert(filter_type == CALLABLE); + item = PyObject_CallOneArg(filter, match); + } Py_DECREF(match); if (!item) goto error; @@ -1482,6 +1524,69 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, return (PyObject*) self; } +/*[clinic input] +_sre.template + + pattern: object + template: object(subclass_of="&PyList_Type") + A list containing interleaved literal strings (str or bytes) and group + indices (int), as returned by re._parser.parse_template(): + [literal1, group1, ..., literalN, groupN] + / + +[clinic start generated code]*/ + +static PyObject * +_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template) +/*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/ +{ + /* template is a list containing interleaved literal strings (str or bytes) + * and group indices (int), as returned by _parser.parse_template: + * [literal1, group1, literal2, ..., literalN]. + */ + _sremodulestate *module_state = get_sre_module_state(module); + TemplateObject *self = NULL; + Py_ssize_t n = PyList_GET_SIZE(template); + if ((n & 1) == 0 || n < 1) { + goto bad_template; + } + n /= 2; + self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, n); + if (!self) + return NULL; + self->chunks = 1 + 2*n; + self->literal = PyList_GET_ITEM(template, 0); + Py_INCREF(self->literal); + for (Py_ssize_t i = 0; i < n; i++) { + Py_ssize_t index = PyLong_AsSsize_t(PyList_GET_ITEM(template, 2*i+1)); + if (index == -1 && PyErr_Occurred()) { + Py_DECREF(self); + return NULL; + } + if (index < 0) { + goto bad_template; + } + self->items[i].index = index; + + PyObject *literal = PyList_GET_ITEM(template, 2*i+2); + // Skip empty literals. + if ((PyUnicode_Check(literal) && !PyUnicode_GET_LENGTH(literal)) || + (PyBytes_Check(literal) && !PyBytes_GET_SIZE(literal))) + { + literal = NULL; + self->chunks--; + } + Py_XINCREF(literal); + self->items[i].literal = literal; + } + return (PyObject*) self; + +bad_template: + PyErr_SetString(PyExc_TypeError, "invalid template"); + Py_XDECREF(self); + return NULL; +} + /* -------------------------------------------------------------------- */ /* Code validation */ @@ -2096,11 +2201,14 @@ static PyObject * _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template) /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/ { - /* delegate to Python code */ - return call( - SRE_PY_MODULE, "_expand", - PyTuple_Pack(3, self->pattern, self, template) - ); + _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self)); + PyObject *filter = compile_template(module_state, self->pattern, template); + if (filter == NULL) { + return NULL; + } + PyObject *result = expand_template((TemplateObject *)filter, self); + Py_DECREF(filter); + return result; } static PyObject* @@ -2685,6 +2793,112 @@ pattern_scanner(_sremodulestate *module_state, return (PyObject*) scanner; } +/* -------------------------------------------------------------------- */ +/* template methods */ + +static int +template_traverse(TemplateObject *self, visitproc visit, void *arg) +{ + Py_VISIT(Py_TYPE(self)); + Py_VISIT(self->literal); + for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) { + Py_VISIT(self->items[i].literal); + } + return 0; +} + +static int +template_clear(TemplateObject *self) +{ + Py_CLEAR(self->literal); + for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) { + Py_CLEAR(self->items[i].literal); + } + return 0; +} + +static void +template_dealloc(TemplateObject *self) +{ + PyTypeObject *tp = Py_TYPE(self); + + PyObject_GC_UnTrack(self); + (void)template_clear(self); + tp->tp_free(self); + Py_DECREF(tp); +} + +static PyObject * +expand_template(TemplateObject *self, MatchObject *match) +{ + if (Py_SIZE(self) == 0) { + Py_INCREF(self->literal); + return self->literal; + } + + PyObject *result = NULL; + Py_ssize_t count = 0; // the number of non-empty chunks + /* For small number of strings use a buffer allocated on the stack, + * otherwise use a list object. */ + PyObject *buffer[10]; + PyObject **out = buffer; + PyObject *list = NULL; + if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) || + !PyUnicode_Check(self->literal)) + { + list = PyList_New(self->chunks); + if (!list) { + return NULL; + } + out = &PyList_GET_ITEM(list, 0); + } + + Py_INCREF(self->literal); + out[count++] = self->literal; + for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) { + Py_ssize_t index = self->items[i].index; + if (index >= match->groups) { + PyErr_SetString(PyExc_IndexError, "no such group"); + goto cleanup; + } + PyObject *item = match_getslice_by_index(match, index, Py_None); + if (item == NULL) { + goto cleanup; + } + if (item != Py_None) { + Py_INCREF(item); + out[count++] = item; + } + Py_DECREF(item); + + PyObject *literal = self->items[i].literal; + if (literal != NULL) { + Py_INCREF(literal); + out[count++] = literal; + } + } + + if (PyUnicode_Check(self->literal)) { + result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count); + } + else { + Py_SET_SIZE(list, count); + result = _PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list); + } + +cleanup: + if (list) { + Py_DECREF(list); + } + else { + for (Py_ssize_t i = 0; i < count; i++) { + Py_DECREF(out[i]); + } + } + return result; +} + + static Py_hash_t pattern_hash(PatternObject *self) { @@ -2907,15 +3121,32 @@ static PyType_Slot scanner_slots[] = { }; static PyType_Spec scanner_spec = { - .name = "_" SRE_MODULE ".SRE_Scanner", + .name = "_sre.SRE_Scanner", .basicsize = sizeof(ScannerObject), .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC), .slots = scanner_slots, }; +static PyType_Slot template_slots[] = { + {Py_tp_dealloc, template_dealloc}, + {Py_tp_traverse, template_traverse}, + {Py_tp_clear, template_clear}, + {0, NULL}, +}; + +static PyType_Spec template_spec = { + .name = "_sre.SRE_Template", + .basicsize = sizeof(TemplateObject), + .itemsize = sizeof(((TemplateObject *)0)->items[0]), + .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE | + Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC), + .slots = template_slots, +}; + static PyMethodDef _functions[] = { _SRE_COMPILE_METHODDEF + _SRE_TEMPLATE_METHODDEF _SRE_GETCODESIZE_METHODDEF _SRE_ASCII_ISCASED_METHODDEF _SRE_UNICODE_ISCASED_METHODDEF @@ -2932,6 +3163,8 @@ sre_traverse(PyObject *module, visitproc visit, void *arg) Py_VISIT(state->Pattern_Type); Py_VISIT(state->Match_Type); Py_VISIT(state->Scanner_Type); + Py_VISIT(state->Template_Type); + Py_VISIT(state->compile_template); return 0; } @@ -2944,6 +3177,8 @@ sre_clear(PyObject *module) Py_CLEAR(state->Pattern_Type); Py_CLEAR(state->Match_Type); Py_CLEAR(state->Scanner_Type); + Py_CLEAR(state->Template_Type); + Py_CLEAR(state->compile_template); return 0; } @@ -2984,6 +3219,7 @@ sre_exec(PyObject *m) CREATE_TYPE(m, state->Pattern_Type, &pattern_spec); CREATE_TYPE(m, state->Match_Type, &match_spec); CREATE_TYPE(m, state->Scanner_Type, &scanner_spec); + CREATE_TYPE(m, state->Template_Type, &template_spec); if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) { goto error; @@ -3013,7 +3249,7 @@ static PyModuleDef_Slot sre_slots[] = { static struct PyModuleDef sremodule = { .m_base = PyModuleDef_HEAD_INIT, - .m_name = "_" SRE_MODULE, + .m_name = "_sre", .m_size = sizeof(_sremodulestate), .m_methods = _functions, .m_slots = sre_slots, diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h index 52ae3e11b5f..d967d9ea04b 100644 --- a/Modules/_sre/sre.h +++ b/Modules/_sre/sre.h @@ -52,6 +52,17 @@ typedef struct { Py_ssize_t mark[1]; } MatchObject; +typedef struct { + PyObject_VAR_HEAD + Py_ssize_t chunks; /* the number of group references and non-NULL literals + * self->chunks <= 2*Py_SIZE(self) + 1 */ + PyObject *literal; + struct { + Py_ssize_t index; + PyObject *literal; /* NULL if empty */ + } items[0]; +} TemplateObject; + typedef struct SRE_REPEAT_T { Py_ssize_t count; const SRE_CODE* pattern; /* points to REPEAT operator arguments */ diff --git a/Modules/_sre/sre_constants.h b/Modules/_sre/sre_constants.h index f030815c6c0..b5692292f65 100644 --- a/Modules/_sre/sre_constants.h +++ b/Modules/_sre/sre_constants.h @@ -11,7 +11,7 @@ * See the sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20220615 +#define SRE_MAGIC 20221023 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2