bpo-25054, bpo-1647489: Added support of splitting on zerowidth patterns. (#4471)

Also fixed searching patterns that could match an empty string.
This commit is contained in:
Serhiy Storchaka 2017-12-04 14:29:05 +02:00 committed by GitHub
parent e69fbb6a56
commit 70d56fb525
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 128 additions and 117 deletions

View File

@ -708,37 +708,19 @@ form.
That way, separator components are always found at the same relative
indices within the result list.
.. note::
The pattern can match empty strings. ::
:func:`split` doesn't currently split a string on an empty pattern match.
For example::
>>> re.split('x*', 'axbc')
['a', 'bc']
Even though ``'x*'`` also matches 0 'x' before 'a', between 'b' and 'c',
and after 'c', currently these matches are ignored. The correct behavior
(i.e. splitting on empty matches too and returning ``['', 'a', 'b', 'c',
'']``) will be implemented in future versions of Python, but since this
is a backward incompatible change, a :exc:`FutureWarning` will be raised
in the meanwhile.
Patterns that can only match empty strings currently never split the
string. Since this doesn't match the expected behavior, a
:exc:`ValueError` will be raised starting from Python 3.5::
>>> re.split("^$", "foo\n\nbar\n", flags=re.M)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
...
ValueError: split() requires a non-empty pattern match.
>>> re.split(r'\b', 'Words, words, words.')
['', 'Words', ', ', 'words', ', ', 'words', '.']
>>> re.split(r'(\W*)', '...words...')
['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '']
.. versionchanged:: 3.1
Added the optional flags argument.
.. versionchanged:: 3.5
Splitting on a pattern that could match an empty string now raises
a warning. Patterns that can only match empty strings are now rejected.
.. versionchanged:: 3.7
Added support for splitting on a pattern that could match an empty string.
.. function:: findall(pattern, string, flags=0)
@ -746,8 +728,10 @@ form.
strings. The *string* is scanned left-to-right, and matches are returned in
the order found. If one or more groups are present in the pattern, return a
list of groups; this will be a list of tuples if the pattern has more than
one group. Empty matches are included in the result unless they touch the
beginning of another match.
one group. Empty matches are included in the result.
.. versionchanged:: 3.7
Non-empty matches can now start just after a previous empty match.
.. function:: finditer(pattern, string, flags=0)
@ -755,8 +739,10 @@ form.
Return an :term:`iterator` yielding :ref:`match objects <match-objects>` over
all non-overlapping matches for the RE *pattern* in *string*. The *string*
is scanned left-to-right, and matches are returned in the order found. Empty
matches are included in the result unless they touch the beginning of another
match.
matches are included in the result.
.. versionchanged:: 3.7
Non-empty matches can now start just after a previous empty match.
.. function:: sub(pattern, repl, string, count=0, flags=0)

View File

@ -364,6 +364,10 @@ The flags :const:`re.ASCII`, :const:`re.LOCALE` and :const:`re.UNICODE`
can be set within the scope of a group.
(Contributed by Serhiy Storchaka in :issue:`31690`.)
:func:`re.split` now supports splitting on a pattern like ``r'\b'``,
``'^$'`` or ``'(?=-)'`` that matches an empty string.
(Contributed by Serhiy Storchaka in :issue:`25054`.)
string
------
@ -768,6 +772,23 @@ Changes in the Python API
avoid a warning escape them with a backslash.
(Contributed by Serhiy Storchaka in :issue:`30349`.)
* The result of splitting a string on a :mod:`regular expression <re>`
that could match an empty string has been changed. For example,
splitting on ``r'\s*'`` will now split not only on whitespace as it
did previously, but also between any pair of non-whitespace
characters. The previous behavior can be restored by changing the pattern
to ``r'\s+'``. A :exc:`FutureWarning` has been emitted for such patterns
since Python 3.5.
For patterns that match both empty and non-empty strings, the result of
searching for all matches may also be changed in other cases. For example
in the string ``'a\n\n'``, the pattern ``r'(?m)^\s*?$'`` will not only
match empty strings at positions 2 and 3, but also the string ``'\n'`` at
positions 2--3. To match only blank lines, the pattern should be rewritten
as ``r'(?m)^[^\S\n]*$'``.
(Contributed by Serhiy Storchaka in :issue:`25054`.)
* :class:`tracemalloc.Traceback` frames are now sorted from oldest to most
recent to be more consistent with :mod:`traceback`.
(Contributed by Jesse Bakker in :issue:`32121`.)

View File

@ -1611,7 +1611,7 @@ class OutputChecker:
'', want)
# If a line in got contains only spaces, then remove the
# spaces.
got = re.sub(r'(?m)^\s*?$', '', got)
got = re.sub(r'(?m)^[^\S\n]+$', '', got)
if got == want:
return True

View File

@ -331,21 +331,21 @@ class ReTests(unittest.TestCase):
['', 'a', '', '', 'c'])
for sep, expected in [
(':*', ['', 'a', 'b', 'c']),
('(?::*)', ['', 'a', 'b', 'c']),
('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
(':*', ['', 'a', 'b', 'c', '']),
('(?::*)', ['', 'a', 'b', 'c', '']),
('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']),
('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']),
]:
with self.subTest(sep=sep), self.assertWarns(FutureWarning):
with self.subTest(sep=sep):
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
for sep, expected in [
('', [':a:b::c']),
(r'\b', [':a:b::c']),
(r'(?=:)', [':a:b::c']),
(r'(?<=:)', [':a:b::c']),
('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
(r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
(r'(?=:)', ['', ':a', ':b', ':', ':c']),
(r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
]:
with self.subTest(sep=sep), self.assertRaises(ValueError):
with self.subTest(sep=sep):
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
def test_qualified_re_split(self):
@ -356,9 +356,8 @@ class ReTests(unittest.TestCase):
['', ':', 'a', ':', 'b::c'])
self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c'])
with self.assertWarns(FutureWarning):
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c'])
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c'])
def test_re_findall(self):
self.assertEqual(re.findall(":+", "abc"), [])
@ -1751,6 +1750,25 @@ class ReTests(unittest.TestCase):
"span=(3, 5), match='bb'>" %
(type(second).__module__, type(second).__qualname__))
def test_zerowidth(self):
# Issues 852532, 1647489, 3262, 25054.
# Every case below runs against the same subject string, "a::bc", and
# checks how zero-width (empty) matches are reported.

# split(): an empty match is a split point, so r"\b" yields empty
# leading and trailing fields here.
self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', ''])
self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', 'bc'])
self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
# sub(): every empty match is replaced, and the non-empty "::" match
# is replaced as well.
self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a--bc-')
self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::]bc[]')
# findall(): empty and non-empty matches are both returned, in order.
self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
['', 'a', '', '', 'bc', ''])
# finditer(): the spans show a non-empty match starting at the position
# of the preceding empty match, e.g. (1, 1) followed by (1, 3).
self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
[(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
[(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
def test_bug_2537(self):
# issue 2537: empty submatches

View File

@ -0,0 +1 @@
Added support for splitting on a pattern that could match an empty string.

View File

@ -0,0 +1,3 @@
Fixed searching regular expression patterns that could match an empty
string. Non-empty strings can now be correctly found after matching an empty
string.

View File

@ -446,6 +446,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->isbytes = isbytes;
state->charsize = charsize;
state->match_all = 0;
state->must_advance = 0;
state->beginning = ptr;
@ -559,14 +561,14 @@ pattern_dealloc(PatternObject* self)
}
LOCAL(Py_ssize_t)
sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all)
sre_match(SRE_STATE* state, SRE_CODE* pattern)
{
if (state->charsize == 1)
return sre_ucs1_match(state, pattern, match_all);
return sre_ucs1_match(state, pattern, 1);
if (state->charsize == 2)
return sre_ucs2_match(state, pattern, match_all);
return sre_ucs2_match(state, pattern, 1);
assert(state->charsize == 4);
return sre_ucs4_match(state, pattern, match_all);
return sre_ucs4_match(state, pattern, 1);
}
LOCAL(Py_ssize_t)
@ -606,7 +608,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string,
TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
status = sre_match(&state, PatternObject_GetCode(self), 0);
status = sre_match(&state, PatternObject_GetCode(self));
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
@ -645,7 +647,8 @@ _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string,
TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
status = sre_match(&state, PatternObject_GetCode(self), 1);
state.match_all = 1;
status = sre_match(&state, PatternObject_GetCode(self));
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
@ -808,11 +811,8 @@ _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
if (status < 0)
goto error;
if (state.ptr == state.start)
state.start = (void*) ((char*) state.ptr + state.charsize);
else
state.start = state.ptr;
state.must_advance = (state.ptr == state.start);
state.start = state.ptr;
}
state_fini(&state);
@ -901,17 +901,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
void* last;
assert(self->codesize != 0);
if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
PyErr_SetString(PyExc_ValueError,
"split() requires a non-empty pattern match.");
return NULL;
}
if (PyErr_WarnEx(PyExc_FutureWarning,
"split() requires a non-empty pattern match.",
1) < 0)
return NULL;
}
if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
return NULL;
@ -942,14 +931,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
goto error;
}
if (state.start == state.ptr) {
if (last == state.end || state.ptr == state.end)
break;
/* skip one character */
state.start = (void*) ((char*) state.ptr + state.charsize);
continue;
}
/* get segment before this match */
item = getslice(state.isbytes, state.beginning,
string, STATE_OFFSET(&state, last),
@ -974,7 +955,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
}
n = n + 1;
state.must_advance = 1;
last = state.start = state.ptr;
}
@ -1101,9 +1082,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
if (status < 0)
goto error;
} else if (i == b && i == e && n > 0)
/* ignore empty match on latest position */
goto next;
}
if (filter_is_callable) {
/* pass match object through filter */
@ -1130,16 +1109,8 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
i = e;
n = n + 1;
next:
/* move on */
if (state.ptr == state.end)
break;
if (state.ptr == state.start)
state.start = (void*) ((char*) state.ptr + state.charsize);
else
state.start = state.ptr;
state.must_advance = 1;
state.start = state.ptr;
}
/* get segment following last match */
@ -2450,7 +2421,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self)
state->ptr = state->start;
status = sre_match(state, PatternObject_GetCode(self->pattern), 0);
status = sre_match(state, PatternObject_GetCode(self->pattern));
if (PyErr_Occurred())
return NULL;
@ -2459,12 +2430,10 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self)
if (status == 0)
state->start = NULL;
else if (state->ptr != state->start)
else {
state->must_advance = (state->ptr == state->start);
state->start = state->ptr;
else if (state->ptr != state->end)
state->start = (void*) ((char*) state->ptr + state->charsize);
else
state->start = NULL;
}
return match;
}
@ -2499,12 +2468,10 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self)
if (status == 0)
state->start = NULL;
else if (state->ptr != state->start)
else {
state->must_advance = (state->ptr == state->start);
state->start = state->ptr;
else if (state->ptr != state->end)
state->start = (void*) ((char*) state->ptr + state->charsize);
else
state->start = NULL;
}
return match;
}

View File

@ -67,6 +67,7 @@ typedef struct {
void* end; /* end of original string */
/* attributes for the match object */
PyObject* string;
Py_buffer buffer;
Py_ssize_t pos, endpos;
int isbytes;
int charsize; /* character size */
@ -74,11 +75,12 @@ typedef struct {
Py_ssize_t lastindex;
Py_ssize_t lastmark;
void** mark;
int match_all;
int must_advance;
/* dynamically allocated stuff */
char* data_stack;
size_t data_stack_size;
size_t data_stack_base;
Py_buffer buffer;
/* current repeat context */
SRE_REPEAT *repeat;
} SRE_STATE;

View File

@ -199,7 +199,7 @@ SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
return up != lo && SRE(charset)(state, set, up);
}
LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel);
LOCAL(Py_ssize_t)
SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
@ -510,12 +510,12 @@ do { \
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \
#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
DATA_ALLOC(SRE(match_context), nextctx); \
nextctx->last_ctx_pos = ctx_pos; \
nextctx->jump = jumpvalue; \
nextctx->pattern = nextpattern; \
nextctx->match_all = matchall; \
nextctx->toplevel = toplevel_; \
ctx_pos = alloc_pos; \
ctx = nextctx; \
goto entrance; \
@ -523,7 +523,7 @@ do { \
while (0) /* gcc doesn't like labels at end of scopes */ \
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all)
DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel)
#define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
@ -540,13 +540,13 @@ typedef struct {
SRE_CODE chr;
SRE_REPEAT* rep;
} u;
int match_all;
int toplevel;
} SRE(match_context);
/* check if string matches the given pattern. returns <0 for
error, 0 for failure, and 1 for success */
LOCAL(Py_ssize_t)
SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
{
SRE_CHAR* end = (SRE_CHAR *)state->end;
Py_ssize_t alloc_pos, ctx_pos = -1;
@ -563,7 +563,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
ctx->last_ctx_pos = -1;
ctx->jump = JUMP_NONE;
ctx->pattern = pattern;
ctx->match_all = match_all;
ctx->toplevel = toplevel;
ctx_pos = alloc_pos;
entrance:
@ -636,11 +636,14 @@ entrance:
case SRE_OP_SUCCESS:
/* end of pattern */
TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
if (!ctx->match_all || ctx->ptr == state->end) {
state->ptr = ctx->ptr;
RETURN_SUCCESS;
if (ctx->toplevel &&
((state->match_all && ctx->ptr != state->end) ||
(state->must_advance && ctx->ptr == state->start)))
{
RETURN_FAILURE;
}
RETURN_FAILURE;
state->ptr = ctx->ptr;
RETURN_SUCCESS;
case SRE_OP_AT:
/* match at given position */
@ -856,7 +859,9 @@ entrance:
RETURN_FAILURE;
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
ctx->ptr == state->end) {
ctx->ptr == state->end &&
!(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
{
/* tail is empty. we're finished */
state->ptr = ctx->ptr;
RETURN_SUCCESS;
@ -941,7 +946,10 @@ entrance:
}
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
(!match_all || ctx->ptr == state->end)) {
!(ctx->toplevel &&
((state->match_all && ctx->ptr != state->end) ||
(state->must_advance && ctx->ptr == state->start))))
{
/* tail is empty. we're finished */
state->ptr = ctx->ptr;
RETURN_SUCCESS;
@ -1417,6 +1425,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
return 0; /* literal can't match: doesn't fit in char width */
#endif
end = (SRE_CHAR *)state->end;
state->must_advance = 0;
while (ptr < end) {
while (*ptr != c) {
if (++ptr >= end)
@ -1458,6 +1467,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
return 0;
i = 1;
state->must_advance = 0;
do {
if (*ptr == (SRE_CHAR) prefix[i]) {
if (++i != prefix_len) {
@ -1487,6 +1497,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
if (charset) {
/* pattern starts with a character from a known set */
end = (SRE_CHAR *)state->end;
state->must_advance = 0;
for (;;) {
while (ptr < end && !SRE(charset)(state, charset, *ptr))
ptr++;
@ -1503,13 +1514,15 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
} else {
/* general case */
assert(ptr <= end);
while (1) {
TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
state->start = state->ptr = ptr;
status = SRE(match)(state, pattern, 1);
state->must_advance = 0;
while (status == 0 && ptr < end) {
ptr++;
TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
state->start = state->ptr = ptr;
status = SRE(match)(state, pattern, 0);
if (status != 0 || ptr >= end)
break;
ptr++;
}
}