Fixed the character set description in the docstring (SRE uses Python
strings, not C strings).
Removed the USE_PYTHON defines and the related sre.py helpers.
Skip calling the subx helper if the template is callable.
Interestingly enough, this means that
    def callback(m): return literal
    result = pattern.sub(callback, string)
is much faster than
    result = pattern.sub(literal, string)
This commit is contained in:
parent
0402dd18cb
commit
dac58492aa
81
Lib/sre.py
81
Lib/sre.py
|
@ -17,15 +17,13 @@
|
||||||
r"""Support for regular expressions (RE).
|
r"""Support for regular expressions (RE).
|
||||||
|
|
||||||
This module provides regular expression matching operations similar to
|
This module provides regular expression matching operations similar to
|
||||||
those found in Perl. It's 8-bit clean: the strings being processed may
|
those found in Perl. It supports both 8-bit and Unicode strings; both
|
||||||
contain both null bytes and characters whose high bit is set. Regular
|
the pattern and the strings being processed can contain null bytes and
|
||||||
expression pattern strings may not contain null bytes, but can specify
|
characters outside the US ASCII range.
|
||||||
the null byte using the \\number notation. Characters with the high
|
|
||||||
bit set may be included.
|
|
||||||
|
|
||||||
Regular expressions can contain both special and ordinary
|
Regular expressions can contain both special and ordinary characters.
|
||||||
characters. Most ordinary characters, like "A", "a", or "0", are the
|
Most ordinary characters, like "A", "a", or "0", are the simplest
|
||||||
simplest regular expressions; they simply match themselves. You can
|
regular expressions; they simply match themselves. You can
|
||||||
concatenate ordinary characters, so last matches the string 'last'.
|
concatenate ordinary characters, so last matches the string 'last'.
|
||||||
|
|
||||||
The special characters are:
|
The special characters are:
|
||||||
|
@ -45,7 +43,7 @@ The special characters are:
|
||||||
"|" A|B, creates an RE that will match either A or B.
|
"|" A|B, creates an RE that will match either A or B.
|
||||||
(...) Matches the RE inside the parentheses.
|
(...) Matches the RE inside the parentheses.
|
||||||
The contents can be retrieved or matched later in the string.
|
The contents can be retrieved or matched later in the string.
|
||||||
(?iLmsx) Set the I, L, M, S, or X flag for the RE (see below).
|
(?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below).
|
||||||
(?:...) Non-grouping version of regular parentheses.
|
(?:...) Non-grouping version of regular parentheses.
|
||||||
(?P<name>...) The substring matched by the group is accessible by name.
|
(?P<name>...) The substring matched by the group is accessible by name.
|
||||||
(?P=name) Matches the text matched earlier by the group named name.
|
(?P=name) Matches the text matched earlier by the group named name.
|
||||||
|
@ -246,77 +244,14 @@ def _expand(pattern, match, template):
|
||||||
|
|
||||||
def _subx(pattern, template):
|
def _subx(pattern, template):
|
||||||
# internal: pattern.sub/subn implementation helper
|
# internal: pattern.sub/subn implementation helper
|
||||||
if callable(template):
|
|
||||||
filter = template
|
|
||||||
else:
|
|
||||||
template = _compile_repl(template, pattern)
|
template = _compile_repl(template, pattern)
|
||||||
if not template[0] and len(template[1]) == 1:
|
if not template[0] and len(template[1]) == 1:
|
||||||
# literal replacement
|
# literal replacement
|
||||||
filter = template[1][0]
|
return template[1][0]
|
||||||
else:
|
|
||||||
def filter(match, template=template):
|
def filter(match, template=template):
|
||||||
return sre_parse.expand_template(template, match)
|
return sre_parse.expand_template(template, match)
|
||||||
return filter
|
return filter
|
||||||
|
|
||||||
def _sub(pattern, template, text, count=0):
|
|
||||||
# internal: pattern.sub implementation hook
|
|
||||||
# FIXME: not used in SRE 2.2.1 and later; will be removed soon
|
|
||||||
return _subn(pattern, template, text, count)[0]
|
|
||||||
|
|
||||||
def _subn(pattern, template, text, count=0):
|
|
||||||
# internal: pattern.subn implementation hook
|
|
||||||
# FIXME: not used in SRE 2.2.1 and later; will be removed soon
|
|
||||||
filter = _subx(pattern, template)
|
|
||||||
if not callable(filter):
|
|
||||||
# literal replacement
|
|
||||||
def filter(match, literal=filter):
|
|
||||||
return literal
|
|
||||||
n = i = 0
|
|
||||||
s = []
|
|
||||||
append = s.append
|
|
||||||
c = pattern.scanner(text)
|
|
||||||
while not count or n < count:
|
|
||||||
m = c.search()
|
|
||||||
if not m:
|
|
||||||
break
|
|
||||||
b, e = m.span()
|
|
||||||
if i < b:
|
|
||||||
append(text[i:b])
|
|
||||||
elif i == b == e and n:
|
|
||||||
append(text[i:b])
|
|
||||||
continue # ignore empty match at previous position
|
|
||||||
append(filter(m))
|
|
||||||
i = e
|
|
||||||
n = n + 1
|
|
||||||
append(text[i:])
|
|
||||||
return _join(s, text[:0]), n
|
|
||||||
|
|
||||||
def _split(pattern, text, maxsplit=0):
|
|
||||||
# internal: pattern.split implementation hook
|
|
||||||
# FIXME: not used in SRE 2.2.1 and later; will be removed soon
|
|
||||||
n = i = 0
|
|
||||||
s = []
|
|
||||||
append = s.append
|
|
||||||
extend = s.extend
|
|
||||||
c = pattern.scanner(text)
|
|
||||||
g = pattern.groups
|
|
||||||
while not maxsplit or n < maxsplit:
|
|
||||||
m = c.search()
|
|
||||||
if not m:
|
|
||||||
break
|
|
||||||
b, e = m.span()
|
|
||||||
if b == e:
|
|
||||||
if i >= len(text):
|
|
||||||
break
|
|
||||||
continue
|
|
||||||
append(text[i:b])
|
|
||||||
if g and b != e:
|
|
||||||
extend(list(m.groups()))
|
|
||||||
i = e
|
|
||||||
n = n + 1
|
|
||||||
append(text[i:])
|
|
||||||
return s
|
|
||||||
|
|
||||||
# register myself for pickling
|
# register myself for pickling
|
||||||
|
|
||||||
import copy_reg
|
import copy_reg
|
||||||
|
|
|
@ -76,10 +76,6 @@ static char copyright[] =
|
||||||
/* -------------------------------------------------------------------- */
|
/* -------------------------------------------------------------------- */
|
||||||
/* optional features */
|
/* optional features */
|
||||||
|
|
||||||
/* test: define to use sre.py helpers instead of C code */
|
|
||||||
#undef USE_PYTHON_SPLIT
|
|
||||||
#undef USE_PYTHON_SUB
|
|
||||||
|
|
||||||
/* prevent run-away recursion (bad patterns on long strings) */
|
/* prevent run-away recursion (bad patterns on long strings) */
|
||||||
|
|
||||||
#if !defined(USE_STACKCHECK)
|
#if !defined(USE_STACKCHECK)
|
||||||
|
@ -1251,6 +1247,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
|
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
|
||||||
state->start = ptr;
|
state->start = ptr;
|
||||||
state->ptr = ++ptr;
|
state->ptr = ++ptr;
|
||||||
|
if (flags & SRE_INFO_LITERAL)
|
||||||
|
return 1; /* we got all of it */
|
||||||
status = SRE_MATCH(state, pattern + 2, 1);
|
status = SRE_MATCH(state, pattern + 2, 1);
|
||||||
if (status != 0)
|
if (status != 0)
|
||||||
break;
|
break;
|
||||||
|
@ -1820,66 +1818,6 @@ join(PyObject* list, PyObject* pattern)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef USE_PYTHON_SUB
|
|
||||||
static PyObject*
|
|
||||||
pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
|
|
||||||
{
|
|
||||||
PyObject* template;
|
|
||||||
PyObject* string;
|
|
||||||
PyObject* count = Py_False; /* zero */
|
|
||||||
static char* kwlist[] = { "repl", "string", "count", NULL };
|
|
||||||
if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:sub", kwlist,
|
|
||||||
&template, &string, &count))
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
/* delegate to Python code */
|
|
||||||
return call(
|
|
||||||
SRE_MODULE, "_sub",
|
|
||||||
Py_BuildValue("OOOO", self, template, string, count)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef USE_PYTHON_SUB
|
|
||||||
static PyObject*
|
|
||||||
pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
|
|
||||||
{
|
|
||||||
PyObject* template;
|
|
||||||
PyObject* string;
|
|
||||||
PyObject* count = Py_False; /* zero */
|
|
||||||
static char* kwlist[] = { "repl", "string", "count", NULL };
|
|
||||||
if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:subn", kwlist,
|
|
||||||
&template, &string, &count))
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
/* delegate to Python code */
|
|
||||||
return call(
|
|
||||||
SRE_MODULE, "_subn",
|
|
||||||
Py_BuildValue("OOOO", self, template, string, count)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(USE_PYTHON_SPLIT)
|
|
||||||
static PyObject*
|
|
||||||
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
|
||||||
{
|
|
||||||
PyObject* string;
|
|
||||||
PyObject* maxsplit = Py_False; /* zero */
|
|
||||||
static char* kwlist[] = { "source", "maxsplit", NULL };
|
|
||||||
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|O:split", kwlist,
|
|
||||||
&string, &maxsplit))
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
/* delegate to Python code */
|
|
||||||
return call(
|
|
||||||
SRE_MODULE, "_split",
|
|
||||||
Py_BuildValue("OOO", self, string, maxsplit)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
{
|
{
|
||||||
|
@ -1980,7 +1918,6 @@ error:
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !defined(USE_PYTHON_SPLIT)
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
{
|
{
|
||||||
|
@ -2071,15 +2008,16 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get segment following last match */
|
/* get segment following last match */
|
||||||
item = PySequence_GetSlice(
|
i = STATE_OFFSET(&state, last);
|
||||||
string, STATE_OFFSET(&state, last), state.endpos
|
if (i < state.endpos) {
|
||||||
);
|
item = PySequence_GetSlice(string, i, state.endpos);
|
||||||
if (!item)
|
if (!item)
|
||||||
goto error;
|
goto error;
|
||||||
status = PyList_Append(list, item);
|
status = PyList_Append(list, item);
|
||||||
Py_DECREF(item);
|
Py_DECREF(item);
|
||||||
if (status < 0)
|
if (status < 0)
|
||||||
goto error;
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
state_fini(&state);
|
state_fini(&state);
|
||||||
return list;
|
return list;
|
||||||
|
@ -2090,9 +2028,7 @@ error:
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !defined(USE_PYTHON_SUB)
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
||||||
int count, int subn)
|
int count, int subn)
|
||||||
|
@ -2108,15 +2044,22 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
||||||
int i, b, e;
|
int i, b, e;
|
||||||
int filter_is_callable;
|
int filter_is_callable;
|
||||||
|
|
||||||
/* call subx helper to get the filter */
|
if (PyCallable_Check(template)) {
|
||||||
|
/* sub/subn takes either a function or a template */
|
||||||
|
filter = template;
|
||||||
|
Py_INCREF(filter);
|
||||||
|
filter_is_callable = 1;
|
||||||
|
} else {
|
||||||
|
/* if not callable, call the template compiler. it may return
|
||||||
|
either a filter function or a literal string */
|
||||||
filter = call(
|
filter = call(
|
||||||
SRE_MODULE, "_subx",
|
SRE_MODULE, "_subx",
|
||||||
Py_BuildValue("OO", self, template)
|
Py_BuildValue("OO", self, template)
|
||||||
);
|
);
|
||||||
if (!filter)
|
if (!filter)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
filter_is_callable = PyCallable_Check(filter);
|
filter_is_callable = PyCallable_Check(filter);
|
||||||
|
}
|
||||||
|
|
||||||
string = state_init(&state, self, string, 0, INT_MAX);
|
string = state_init(&state, self, string, 0, INT_MAX);
|
||||||
if (!string)
|
if (!string)
|
||||||
|
@ -2169,7 +2112,7 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
||||||
goto next;
|
goto next;
|
||||||
|
|
||||||
if (filter_is_callable) {
|
if (filter_is_callable) {
|
||||||
/* filter match */
|
/* pass match object through filter */
|
||||||
match = pattern_new_match(self, &state, 1);
|
match = pattern_new_match(self, &state, 1);
|
||||||
if (!match)
|
if (!match)
|
||||||
goto error;
|
goto error;
|
||||||
|
@ -2186,7 +2129,7 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
||||||
} else {
|
} else {
|
||||||
/* filter is literal string */
|
/* filter is literal string */
|
||||||
item = filter;
|
item = filter;
|
||||||
Py_INCREF(filter);
|
Py_INCREF(item);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* add to list */
|
/* add to list */
|
||||||
|
@ -2208,6 +2151,7 @@ next:
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get segment following last match */
|
/* get segment following last match */
|
||||||
|
if (i < state.endpos) {
|
||||||
item = PySequence_GetSlice(string, i, state.endpos);
|
item = PySequence_GetSlice(string, i, state.endpos);
|
||||||
if (!item)
|
if (!item)
|
||||||
goto error;
|
goto error;
|
||||||
|
@ -2215,11 +2159,13 @@ next:
|
||||||
Py_DECREF(item);
|
Py_DECREF(item);
|
||||||
if (status < 0)
|
if (status < 0)
|
||||||
goto error;
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
state_fini(&state);
|
state_fini(&state);
|
||||||
|
|
||||||
/* convert list to single string */
|
/* convert list to single string (also removes list) */
|
||||||
item = join(list, self->pattern);
|
item = join(list, self->pattern);
|
||||||
|
|
||||||
if (!item)
|
if (!item)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
@ -2262,7 +2208,6 @@ pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
|
|
||||||
return pattern_subx(self, template, string, count, 1);
|
return pattern_subx(self, template, string, count, 1);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
pattern_copy(PatternObject* self, PyObject* args)
|
pattern_copy(PatternObject* self, PyObject* args)
|
||||||
|
|
Loading…
Reference in New Issue