diff --git a/Lib/sre.py b/Lib/sre.py index 701334e5db1..e7517f598fc 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -17,15 +17,13 @@ r"""Support for regular expressions (RE). This module provides regular expression matching operations similar to -those found in Perl. It's 8-bit clean: the strings being processed may -contain both null bytes and characters whose high bit is set. Regular -expression pattern strings may not contain null bytes, but can specify -the null byte using the \\number notation. Characters with the high -bit set may be included. +those found in Perl. It supports both 8-bit and Unicode strings; both +the pattern and the strings being processed can contain null bytes and +characters outside the US ASCII range. -Regular expressions can contain both special and ordinary -characters. Most ordinary characters, like "A", "a", or "0", are the -simplest regular expressions; they simply match themselves. You can +Regular expressions can contain both special and ordinary characters. +Most ordinary characters, like "A", "a", or "0", are the simplest +regular expressions; they simply match themselves. You can concatenate ordinary characters, so last matches the string 'last'. The special characters are: @@ -45,7 +43,7 @@ The special characters are: "|" A|B, creates an RE that will match either A or B. (...) Matches the RE inside the parentheses. The contents can be retrieved or matched later in the string. - (?iLmsx) Set the I, L, M, S, or X flag for the RE (see below). + (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below). (?:...) Non-grouping version of regular parentheses. (?P<name>...) The substring matched by the group is accessible by name. (?P=name) Matches the text matched earlier by the group named name. @@ -54,7 +52,7 @@ The special characters are: (?!...) Matches if ... doesn't match next. The special sequences consist of "\\" and a character from the list -below. If the ordinary character is not on the list, then the +below. 
If the ordinary character is not on the list, then the resulting RE will match the second character. \number Matches the contents of the group of the same number. \A Matches only at the start of the string. @@ -246,76 +244,13 @@ def _expand(pattern, match, template): def _subx(pattern, template): # internal: pattern.sub/subn implementation helper - if callable(template): - filter = template - else: - template = _compile_repl(template, pattern) - if not template[0] and len(template[1]) == 1: - # literal replacement - filter = template[1][0] - else: - def filter(match, template=template): - return sre_parse.expand_template(template, match) - return filter - -def _sub(pattern, template, text, count=0): - # internal: pattern.sub implementation hook - # FIXME: not used in SRE 2.2.1 and later; will be removed soon - return _subn(pattern, template, text, count)[0] - -def _subn(pattern, template, text, count=0): - # internal: pattern.subn implementation hook - # FIXME: not used in SRE 2.2.1 and later; will be removed soon - filter = _subx(pattern, template) - if not callable(filter): + template = _compile_repl(template, pattern) + if not template[0] and len(template[1]) == 1: # literal replacement - def filter(match, literal=filter): - return literal - n = i = 0 - s = [] - append = s.append - c = pattern.scanner(text) - while not count or n < count: - m = c.search() - if not m: - break - b, e = m.span() - if i < b: - append(text[i:b]) - elif i == b == e and n: - append(text[i:b]) - continue # ignore empty match at previous position - append(filter(m)) - i = e - n = n + 1 - append(text[i:]) - return _join(s, text[:0]), n - -def _split(pattern, text, maxsplit=0): - # internal: pattern.split implementation hook - # FIXME: not used in SRE 2.2.1 and later; will be removed soon - n = i = 0 - s = [] - append = s.append - extend = s.extend - c = pattern.scanner(text) - g = pattern.groups - while not maxsplit or n < maxsplit: - m = c.search() - if not m: - break - b, e = m.span() - 
if b == e: - if i >= len(text): - break - continue - append(text[i:b]) - if g and b != e: - extend(list(m.groups())) - i = e - n = n + 1 - append(text[i:]) - return s + return template[1][0] + def filter(match, template=template): + return sre_parse.expand_template(template, match) + return filter # register myself for pickling diff --git a/Modules/_sre.c b/Modules/_sre.c index 3a2d47c997c..5573046d1fa 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -76,10 +76,6 @@ static char copyright[] = /* -------------------------------------------------------------------- */ /* optional features */ -/* test: define to use sre.py helpers instead of C code */ -#undef USE_PYTHON_SPLIT -#undef USE_PYTHON_SUB - /* prevent run-away recursion (bad patterns on long strings) */ #if !defined(USE_STACKCHECK) @@ -1251,6 +1247,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); state->start = ptr; state->ptr = ++ptr; + if (flags & SRE_INFO_LITERAL) + return 1; /* we got all of it */ status = SRE_MATCH(state, pattern + 2, 1); if (status != 0) break; @@ -1820,66 +1818,6 @@ join(PyObject* list, PyObject* pattern) return result; } - -#ifdef USE_PYTHON_SUB -static PyObject* -pattern_sub(PatternObject* self, PyObject* args, PyObject* kw) -{ - PyObject* template; - PyObject* string; - PyObject* count = Py_False; /* zero */ - static char* kwlist[] = { "repl", "string", "count", NULL }; - if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:sub", kwlist, - &template, &string, &count)) - return NULL; - - /* delegate to Python code */ - return call( - SRE_MODULE, "_sub", - Py_BuildValue("OOOO", self, template, string, count) - ); -} -#endif - -#ifdef USE_PYTHON_SUB -static PyObject* -pattern_subn(PatternObject* self, PyObject* args, PyObject* kw) -{ - PyObject* template; - PyObject* string; - PyObject* count = Py_False; /* zero */ - static char* kwlist[] = { "repl", "string", "count", NULL }; - if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:subn", 
kwlist, - &template, &string, &count)) - return NULL; - - /* delegate to Python code */ - return call( - SRE_MODULE, "_subn", - Py_BuildValue("OOOO", self, template, string, count) - ); -} -#endif - -#if defined(USE_PYTHON_SPLIT) -static PyObject* -pattern_split(PatternObject* self, PyObject* args, PyObject* kw) -{ - PyObject* string; - PyObject* maxsplit = Py_False; /* zero */ - static char* kwlist[] = { "source", "maxsplit", NULL }; - if (!PyArg_ParseTupleAndKeywords(args, kw, "O|O:split", kwlist, - &string, &maxsplit)) - return NULL; - - /* delegate to Python code */ - return call( - SRE_MODULE, "_split", - Py_BuildValue("OOO", self, string, maxsplit) - ); -} -#endif - static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject* kw) { @@ -1980,7 +1918,6 @@ error: } -#if !defined(USE_PYTHON_SPLIT) static PyObject* pattern_split(PatternObject* self, PyObject* args, PyObject* kw) { @@ -2071,15 +2008,16 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw) } /* get segment following last match */ - item = PySequence_GetSlice( - string, STATE_OFFSET(&state, last), state.endpos - ); - if (!item) - goto error; - status = PyList_Append(list, item); - Py_DECREF(item); - if (status < 0) - goto error; + i = STATE_OFFSET(&state, last); + if (i < state.endpos) { + item = PySequence_GetSlice(string, i, state.endpos); + if (!item) + goto error; + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; + } state_fini(&state); return list; @@ -2090,9 +2028,7 @@ error: return NULL; } -#endif -#if !defined(USE_PYTHON_SUB) static PyObject* pattern_subx(PatternObject* self, PyObject* template, PyObject* string, int count, int subn) @@ -2108,15 +2044,22 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string, int i, b, e; int filter_is_callable; - /* call subx helper to get the filter */ - filter = call( - SRE_MODULE, "_subx", - Py_BuildValue("OO", self, template) - ); - if (!filter) - return NULL; - - 
filter_is_callable = PyCallable_Check(filter); + if (PyCallable_Check(template)) { + /* sub/subn takes either a function or a template */ + filter = template; + Py_INCREF(filter); + filter_is_callable = 1; + } else { + /* if not callable, call the template compiler. it may return + either a filter function or a literal string */ + filter = call( + SRE_MODULE, "_subx", + Py_BuildValue("OO", self, template) + ); + if (!filter) + return NULL; + filter_is_callable = PyCallable_Check(filter); + } string = state_init(&state, self, string, 0, INT_MAX); if (!string) @@ -2169,7 +2112,7 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string, goto next; if (filter_is_callable) { - /* filter match */ + /* pass match object through filter */ match = pattern_new_match(self, &state, 1); if (!match) goto error; @@ -2186,7 +2129,7 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string, } else { /* filter is literal string */ item = filter; - Py_INCREF(filter); + Py_INCREF(item); } /* add to list */ @@ -2208,18 +2151,21 @@ next: } /* get segment following last match */ - item = PySequence_GetSlice(string, i, state.endpos); - if (!item) - goto error; - status = PyList_Append(list, item); - Py_DECREF(item); - if (status < 0) - goto error; + if (i < state.endpos) { + item = PySequence_GetSlice(string, i, state.endpos); + if (!item) + goto error; + status = PyList_Append(list, item); + Py_DECREF(item); + if (status < 0) + goto error; + } state_fini(&state); - /* convert list to single string */ + /* convert list to single string (also removes list) */ item = join(list, self->pattern); + if (!item) return NULL; @@ -2262,7 +2208,6 @@ pattern_subn(PatternObject* self, PyObject* args, PyObject* kw) return pattern_subx(self, template, string, count, 1); } -#endif static PyObject* pattern_copy(PatternObject* self, PyObject* args)