- Fixed the character set description in the docstring (SRE uses Python
  strings, not C strings).
- Removed the USE_PYTHON defines and the related sre.py helpers.
- Skip calling the subx helper if the template is callable. Interestingly enough, this means that

      def callback(m):
          return literal
      result = pattern.sub(callback, string)

  is much faster than

      result = pattern.sub(literal, string)
This commit is contained in:
parent
0402dd18cb
commit
dac58492aa
93
Lib/sre.py
93
Lib/sre.py
|
@ -17,15 +17,13 @@
|
|||
r"""Support for regular expressions (RE).
|
||||
|
||||
This module provides regular expression matching operations similar to
|
||||
those found in Perl. It's 8-bit clean: the strings being processed may
|
||||
contain both null bytes and characters whose high bit is set. Regular
|
||||
expression pattern strings may not contain null bytes, but can specify
|
||||
the null byte using the \\number notation. Characters with the high
|
||||
bit set may be included.
|
||||
those found in Perl. It supports both 8-bit and Unicode strings; both
|
||||
the pattern and the strings being processed can contain null bytes and
|
||||
characters outside the US ASCII range.
|
||||
|
||||
Regular expressions can contain both special and ordinary
|
||||
characters. Most ordinary characters, like "A", "a", or "0", are the
|
||||
simplest regular expressions; they simply match themselves. You can
|
||||
Regular expressions can contain both special and ordinary characters.
|
||||
Most ordinary characters, like "A", "a", or "0", are the simplest
|
||||
regular expressions; they simply match themselves. You can
|
||||
concatenate ordinary characters, so last matches the string 'last'.
|
||||
|
||||
The special characters are:
|
||||
|
@ -45,7 +43,7 @@ The special characters are:
|
|||
"|" A|B, creates an RE that will match either A or B.
|
||||
(...) Matches the RE inside the parentheses.
|
||||
The contents can be retrieved or matched later in the string.
|
||||
(?iLmsx) Set the I, L, M, S, or X flag for the RE (see below).
|
||||
(?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below).
|
||||
(?:...) Non-grouping version of regular parentheses.
|
||||
(?P<name>...) The substring matched by the group is accessible by name.
|
||||
(?P=name) Matches the text matched earlier by the group named name.
|
||||
|
@ -54,7 +52,7 @@ The special characters are:
|
|||
(?!...) Matches if ... doesn't match next.
|
||||
|
||||
The special sequences consist of "\\" and a character from the list
|
||||
below. If the ordinary character is not on the list, then the
|
||||
below. If the ordinary character is not on the list, then the
|
||||
resulting RE will match the second character.
|
||||
\number Matches the contents of the group of the same number.
|
||||
\A Matches only at the start of the string.
|
||||
|
@ -246,76 +244,13 @@ def _expand(pattern, match, template):
|
|||
|
||||
def _subx(pattern, template):
|
||||
# internal: pattern.sub/subn implementation helper
|
||||
if callable(template):
|
||||
filter = template
|
||||
else:
|
||||
template = _compile_repl(template, pattern)
|
||||
if not template[0] and len(template[1]) == 1:
|
||||
# literal replacement
|
||||
filter = template[1][0]
|
||||
else:
|
||||
def filter(match, template=template):
|
||||
return sre_parse.expand_template(template, match)
|
||||
return filter
|
||||
|
||||
def _sub(pattern, template, text, count=0):
|
||||
# internal: pattern.sub implementation hook
|
||||
# FIXME: not used in SRE 2.2.1 and later; will be removed soon
|
||||
return _subn(pattern, template, text, count)[0]
|
||||
|
||||
def _subn(pattern, template, text, count=0):
|
||||
# internal: pattern.subn implementation hook
|
||||
# FIXME: not used in SRE 2.2.1 and later; will be removed soon
|
||||
filter = _subx(pattern, template)
|
||||
if not callable(filter):
|
||||
template = _compile_repl(template, pattern)
|
||||
if not template[0] and len(template[1]) == 1:
|
||||
# literal replacement
|
||||
def filter(match, literal=filter):
|
||||
return literal
|
||||
n = i = 0
|
||||
s = []
|
||||
append = s.append
|
||||
c = pattern.scanner(text)
|
||||
while not count or n < count:
|
||||
m = c.search()
|
||||
if not m:
|
||||
break
|
||||
b, e = m.span()
|
||||
if i < b:
|
||||
append(text[i:b])
|
||||
elif i == b == e and n:
|
||||
append(text[i:b])
|
||||
continue # ignore empty match at previous position
|
||||
append(filter(m))
|
||||
i = e
|
||||
n = n + 1
|
||||
append(text[i:])
|
||||
return _join(s, text[:0]), n
|
||||
|
||||
def _split(pattern, text, maxsplit=0):
|
||||
# internal: pattern.split implementation hook
|
||||
# FIXME: not used in SRE 2.2.1 and later; will be removed soon
|
||||
n = i = 0
|
||||
s = []
|
||||
append = s.append
|
||||
extend = s.extend
|
||||
c = pattern.scanner(text)
|
||||
g = pattern.groups
|
||||
while not maxsplit or n < maxsplit:
|
||||
m = c.search()
|
||||
if not m:
|
||||
break
|
||||
b, e = m.span()
|
||||
if b == e:
|
||||
if i >= len(text):
|
||||
break
|
||||
continue
|
||||
append(text[i:b])
|
||||
if g and b != e:
|
||||
extend(list(m.groups()))
|
||||
i = e
|
||||
n = n + 1
|
||||
append(text[i:])
|
||||
return s
|
||||
return template[1][0]
|
||||
def filter(match, template=template):
|
||||
return sre_parse.expand_template(template, match)
|
||||
return filter
|
||||
|
||||
# register myself for pickling
|
||||
|
||||
|
|
137
Modules/_sre.c
137
Modules/_sre.c
|
@ -76,10 +76,6 @@ static char copyright[] =
|
|||
/* -------------------------------------------------------------------- */
|
||||
/* optional features */
|
||||
|
||||
/* test: define to use sre.py helpers instead of C code */
|
||||
#undef USE_PYTHON_SPLIT
|
||||
#undef USE_PYTHON_SUB
|
||||
|
||||
/* prevent run-away recursion (bad patterns on long strings) */
|
||||
|
||||
#if !defined(USE_STACKCHECK)
|
||||
|
@ -1251,6 +1247,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
|||
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
|
||||
state->start = ptr;
|
||||
state->ptr = ++ptr;
|
||||
if (flags & SRE_INFO_LITERAL)
|
||||
return 1; /* we got all of it */
|
||||
status = SRE_MATCH(state, pattern + 2, 1);
|
||||
if (status != 0)
|
||||
break;
|
||||
|
@ -1820,66 +1818,6 @@ join(PyObject* list, PyObject* pattern)
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
#ifdef USE_PYTHON_SUB
|
||||
static PyObject*
|
||||
pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
|
||||
{
|
||||
PyObject* template;
|
||||
PyObject* string;
|
||||
PyObject* count = Py_False; /* zero */
|
||||
static char* kwlist[] = { "repl", "string", "count", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:sub", kwlist,
|
||||
&template, &string, &count))
|
||||
return NULL;
|
||||
|
||||
/* delegate to Python code */
|
||||
return call(
|
||||
SRE_MODULE, "_sub",
|
||||
Py_BuildValue("OOOO", self, template, string, count)
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_PYTHON_SUB
|
||||
static PyObject*
|
||||
pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
|
||||
{
|
||||
PyObject* template;
|
||||
PyObject* string;
|
||||
PyObject* count = Py_False; /* zero */
|
||||
static char* kwlist[] = { "repl", "string", "count", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:subn", kwlist,
|
||||
&template, &string, &count))
|
||||
return NULL;
|
||||
|
||||
/* delegate to Python code */
|
||||
return call(
|
||||
SRE_MODULE, "_subn",
|
||||
Py_BuildValue("OOOO", self, template, string, count)
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(USE_PYTHON_SPLIT)
|
||||
static PyObject*
|
||||
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||
{
|
||||
PyObject* string;
|
||||
PyObject* maxsplit = Py_False; /* zero */
|
||||
static char* kwlist[] = { "source", "maxsplit", NULL };
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|O:split", kwlist,
|
||||
&string, &maxsplit))
|
||||
return NULL;
|
||||
|
||||
/* delegate to Python code */
|
||||
return call(
|
||||
SRE_MODULE, "_split",
|
||||
Py_BuildValue("OOO", self, string, maxsplit)
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
static PyObject*
|
||||
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
||||
{
|
||||
|
@ -1980,7 +1918,6 @@ error:
|
|||
|
||||
}
|
||||
|
||||
#if !defined(USE_PYTHON_SPLIT)
|
||||
static PyObject*
|
||||
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||
{
|
||||
|
@ -2071,15 +2008,16 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
|||
}
|
||||
|
||||
/* get segment following last match */
|
||||
item = PySequence_GetSlice(
|
||||
string, STATE_OFFSET(&state, last), state.endpos
|
||||
);
|
||||
if (!item)
|
||||
goto error;
|
||||
status = PyList_Append(list, item);
|
||||
Py_DECREF(item);
|
||||
if (status < 0)
|
||||
goto error;
|
||||
i = STATE_OFFSET(&state, last);
|
||||
if (i < state.endpos) {
|
||||
item = PySequence_GetSlice(string, i, state.endpos);
|
||||
if (!item)
|
||||
goto error;
|
||||
status = PyList_Append(list, item);
|
||||
Py_DECREF(item);
|
||||
if (status < 0)
|
||||
goto error;
|
||||
}
|
||||
|
||||
state_fini(&state);
|
||||
return list;
|
||||
|
@ -2090,9 +2028,7 @@ error:
|
|||
return NULL;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(USE_PYTHON_SUB)
|
||||
static PyObject*
|
||||
pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
||||
int count, int subn)
|
||||
|
@ -2108,15 +2044,22 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
|||
int i, b, e;
|
||||
int filter_is_callable;
|
||||
|
||||
/* call subx helper to get the filter */
|
||||
filter = call(
|
||||
SRE_MODULE, "_subx",
|
||||
Py_BuildValue("OO", self, template)
|
||||
);
|
||||
if (!filter)
|
||||
return NULL;
|
||||
|
||||
filter_is_callable = PyCallable_Check(filter);
|
||||
if (PyCallable_Check(template)) {
|
||||
/* sub/subn takes either a function or a template */
|
||||
filter = template;
|
||||
Py_INCREF(filter);
|
||||
filter_is_callable = 1;
|
||||
} else {
|
||||
/* if not callable, call the template compiler. it may return
|
||||
either a filter function or a literal string */
|
||||
filter = call(
|
||||
SRE_MODULE, "_subx",
|
||||
Py_BuildValue("OO", self, template)
|
||||
);
|
||||
if (!filter)
|
||||
return NULL;
|
||||
filter_is_callable = PyCallable_Check(filter);
|
||||
}
|
||||
|
||||
string = state_init(&state, self, string, 0, INT_MAX);
|
||||
if (!string)
|
||||
|
@ -2169,7 +2112,7 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
|||
goto next;
|
||||
|
||||
if (filter_is_callable) {
|
||||
/* filter match */
|
||||
/* pass match object through filter */
|
||||
match = pattern_new_match(self, &state, 1);
|
||||
if (!match)
|
||||
goto error;
|
||||
|
@ -2186,7 +2129,7 @@ pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
|
|||
} else {
|
||||
/* filter is literal string */
|
||||
item = filter;
|
||||
Py_INCREF(filter);
|
||||
Py_INCREF(item);
|
||||
}
|
||||
|
||||
/* add to list */
|
||||
|
@ -2208,18 +2151,21 @@ next:
|
|||
}
|
||||
|
||||
/* get segment following last match */
|
||||
item = PySequence_GetSlice(string, i, state.endpos);
|
||||
if (!item)
|
||||
goto error;
|
||||
status = PyList_Append(list, item);
|
||||
Py_DECREF(item);
|
||||
if (status < 0)
|
||||
goto error;
|
||||
if (i < state.endpos) {
|
||||
item = PySequence_GetSlice(string, i, state.endpos);
|
||||
if (!item)
|
||||
goto error;
|
||||
status = PyList_Append(list, item);
|
||||
Py_DECREF(item);
|
||||
if (status < 0)
|
||||
goto error;
|
||||
}
|
||||
|
||||
state_fini(&state);
|
||||
|
||||
/* convert list to single string */
|
||||
/* convert list to single string (also removes list) */
|
||||
item = join(list, self->pattern);
|
||||
|
||||
if (!item)
|
||||
return NULL;
|
||||
|
||||
|
@ -2262,7 +2208,6 @@ pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
|
|||
|
||||
return pattern_subx(self, template, string, count, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
static PyObject*
|
||||
pattern_copy(PatternObject* self, PyObject* args)
|
||||
|
|
Loading…
Reference in New Issue