From 70d56fb52582d9d3f7c00860d6e90570c6259371 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 4 Dec 2017 14:29:05 +0200 Subject: [PATCH] bpo-25054, bpo-1647489: Added support of splitting on zerowidth patterns. (#4471) Also fixed searching patterns that could match an empty string. --- Doc/library/re.rst | 46 ++++------- Doc/whatsnew/3.7.rst | 21 +++++ Lib/doctest.py | 2 +- Lib/test/test_re.py | 44 +++++++---- .../2017-11-20-01-01-01.bpo-25054.rOlRV6.rst | 1 + ...2017-11-20-01-29-46.bpo-1647489.-ZNNkh.rst | 3 + Modules/_sre.c | 77 ++++++------------- Modules/sre.h | 4 +- Modules/sre_lib.h | 47 +++++++---- 9 files changed, 128 insertions(+), 117 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-11-20-01-01-01.bpo-25054.rOlRV6.rst create mode 100644 Misc/NEWS.d/next/Library/2017-11-20-01-29-46.bpo-1647489.-ZNNkh.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 8e6eb30f836..dae1d7ea10a 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -708,37 +708,19 @@ form. That way, separator components are always found at the same relative indices within the result list. - .. note:: + The pattern can match empty strings. :: - :func:`split` doesn't currently split a string on an empty pattern match. - For example:: - - >>> re.split('x*', 'axbc') - ['a', 'bc'] - - Even though ``'x*'`` also matches 0 'x' before 'a', between 'b' and 'c', - and after 'c', currently these matches are ignored. The correct behavior - (i.e. splitting on empty matches too and returning ``['', 'a', 'b', 'c', - '']``) will be implemented in future versions of Python, but since this - is a backward incompatible change, a :exc:`FutureWarning` will be raised - in the meanwhile. - - Patterns that can only match empty strings currently never split the - string. 
Since this doesn't match the expected behavior, a - :exc:`ValueError` will be raised starting from Python 3.5:: - - >>> re.split("^$", "foo\n\nbar\n", flags=re.M) - Traceback (most recent call last): - File "<stdin>", line 1, in <module> - ... - ValueError: split() requires a non-empty pattern match. + >>> re.split(r'\b', 'Words, words, words.') + ['', 'Words', ', ', 'words', ', ', 'words', '.'] + >>> re.split(r'(\W*)', '...words...') + ['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', ''] .. versionchanged:: 3.1 Added the optional flags argument. - .. versionchanged:: 3.5 - Splitting on a pattern that could match an empty string now raises - a warning. Patterns that can only match empty strings are now rejected. + .. versionchanged:: 3.7 + Added support of splitting on a pattern that could match an empty string. + .. function:: findall(pattern, string, flags=0) @@ -746,8 +728,10 @@ form. strings. The *string* is scanned left-to-right, and matches are returned in the order found. If one or more groups are present in the pattern, return a list of groups; this will be a list of tuples if the pattern has more than - one group. Empty matches are included in the result unless they touch the - beginning of another match. + one group. Empty matches are included in the result. + + .. versionchanged:: 3.7 + Non-empty matches can now start just after a previous empty match. .. function:: finditer(pattern, string, flags=0) @@ -755,8 +739,10 @@ form. Return an :term:`iterator` yielding :ref:`match objects <match-objects>` over all non-overlapping matches for the RE *pattern* in *string*. The *string* is scanned left-to-right, and matches are returned in the order found. Empty - matches are included in the result unless they touch the beginning of another - match. + matches are included in the result. + + .. versionchanged:: 3.7 + Non-empty matches can now start just after a previous empty match. .. 
function:: sub(pattern, repl, string, count=0, flags=0) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index b6dad4eab6b..3d23aa773d7 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -364,6 +364,10 @@ The flags :const:`re.ASCII`, :const:`re.LOCALE` and :const:`re.UNICODE` can be set within the scope of a group. (Contributed by Serhiy Storchaka in :issue:`31690`.) +:func:`re.split` now supports splitting on a pattern like ``r'\b'``, +``'^$'`` or ``(?=-)`` that matches an empty string. +(Contributed by Serhiy Storchaka in :issue:`25054`.) + string ------ @@ -768,6 +772,23 @@ Changes in the Python API avoid a warning escape them with a backslash. (Contributed by Serhiy Storchaka in :issue:`30349`.) +* The result of splitting a string on a :mod:`regular expression <re>` + that could match an empty string has been changed. For example + splitting on ``r'\s*'`` will now split not only on whitespaces as it + did previously, but also between any pair of non-whitespace + characters. The previous behavior can be restored by changing the pattern + to ``r'\s+'``. A :exc:`FutureWarning` was emitted for such patterns since + Python 3.5. + + For patterns that match both empty and non-empty strings, the result of + searching for all matches may also be changed in other cases. For example + in the string ``'a\n\n'``, the pattern ``r'(?m)^\s*?$'`` will not only + match empty strings at positions 2 and 3, but also the string ``'\n'`` at + positions 2--3. To match only blank lines, the pattern should be rewritten + as ``r'(?m)^[^\S\n]*$'``. + + (Contributed by Serhiy Storchaka in :issue:`25054`.) + * :class:`tracemalloc.Traceback` frames are now sorted from oldest to most recent to be more consistent with :mod:`traceback`. (Contributed by Jesse Bakker in :issue:`32121`.) 
diff --git a/Lib/doctest.py b/Lib/doctest.py index 5e5bc21a038..c1d8a1db111 100644 --- a/Lib/doctest.py +++ b/Lib/doctest.py @@ -1611,7 +1611,7 @@ class OutputChecker: '', want) # If a line in got contains only spaces, then remove the # spaces. - got = re.sub(r'(?m)^\s*?$', '', got) + got = re.sub(r'(?m)^[^\S\n]+$', '', got) if got == want: return True diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index ee87446b792..2344d71abf2 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -331,21 +331,21 @@ class ReTests(unittest.TestCase): ['', 'a', '', '', 'c']) for sep, expected in [ - (':*', ['', 'a', 'b', 'c']), - ('(?::*)', ['', 'a', 'b', 'c']), - ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']), - ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']), + (':*', ['', 'a', 'b', 'c', '']), + ('(?::*)', ['', 'a', 'b', 'c', '']), + ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']), + ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']), ]: - with self.subTest(sep=sep), self.assertWarns(FutureWarning): + with self.subTest(sep=sep): self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) for sep, expected in [ - ('', [':a:b::c']), - (r'\b', [':a:b::c']), - (r'(?=:)', [':a:b::c']), - (r'(?<=:)', [':a:b::c']), + ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']), + (r'\b', [':', 'a', ':', 'b', '::', 'c', '']), + (r'(?=:)', ['', ':a', ':b', ':', ':c']), + (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']), ]: - with self.subTest(sep=sep), self.assertRaises(ValueError): + with self.subTest(sep=sep): self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) def test_qualified_re_split(self): @@ -356,9 +356,8 @@ class ReTests(unittest.TestCase): ['', ':', 'a', ':', 'b::c']) self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c']) - with self.assertWarns(FutureWarning): - self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), - ['', ':', 'a', ':', 'b::c']) + self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), + ['', ':', 'a', ':', 'b::c']) 
def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) @@ -1751,6 +1750,25 @@ class ReTests(unittest.TestCase): "span=(3, 5), match='bb'>" % (type(second).__module__, type(second).__qualname__)) + def test_zerowidth(self): + # Issues 852532, 1647489, 3262, 25054. + self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', '']) + self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', '']) + self.assertEqual(re.split(r"(?isbytes = isbytes; state->charsize = charsize; + state->match_all = 0; + state->must_advance = 0; state->beginning = ptr; @@ -559,14 +561,14 @@ pattern_dealloc(PatternObject* self) } LOCAL(Py_ssize_t) -sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all) +sre_match(SRE_STATE* state, SRE_CODE* pattern) { if (state->charsize == 1) - return sre_ucs1_match(state, pattern, match_all); + return sre_ucs1_match(state, pattern, 1); if (state->charsize == 2) - return sre_ucs2_match(state, pattern, match_all); + return sre_ucs2_match(state, pattern, 1); assert(state->charsize == 4); - return sre_ucs4_match(state, pattern, match_all); + return sre_ucs4_match(state, pattern, 1); } LOCAL(Py_ssize_t) @@ -606,7 +608,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string, TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); - status = sre_match(&state, PatternObject_GetCode(self), 0); + status = sre_match(&state, PatternObject_GetCode(self)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) { @@ -645,7 +647,8 @@ _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string, TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr)); - status = sre_match(&state, PatternObject_GetCode(self), 1); + state.match_all = 1; + status = sre_match(&state, PatternObject_GetCode(self)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); if (PyErr_Occurred()) { @@ -808,11 +811,8 @@ _sre_SRE_Pattern_findall_impl(PatternObject *self, 
PyObject *string, if (status < 0) goto error; - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = state.ptr; - + state.must_advance = (state.ptr == state.start); + state.start = state.ptr; } state_fini(&state); @@ -901,17 +901,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, void* last; assert(self->codesize != 0); - if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) { - if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) { - PyErr_SetString(PyExc_ValueError, - "split() requires a non-empty pattern match."); - return NULL; - } - if (PyErr_WarnEx(PyExc_FutureWarning, - "split() requires a non-empty pattern match.", - 1) < 0) - return NULL; - } if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) return NULL; @@ -942,14 +931,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, goto error; } - if (state.start == state.ptr) { - if (last == state.end || state.ptr == state.end) - break; - /* skip one character */ - state.start = (void*) ((char*) state.ptr + state.charsize); - continue; - } - /* get segment before this match */ item = getslice(state.isbytes, state.beginning, string, STATE_OFFSET(&state, last), @@ -974,7 +955,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string, } n = n + 1; - + state.must_advance = 1; last = state.start = state.ptr; } @@ -1101,9 +1082,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, if (status < 0) goto error; - } else if (i == b && i == e && n > 0) - /* ignore empty match on latest position */ - goto next; + } if (filter_is_callable) { /* pass match object through filter */ @@ -1130,16 +1109,8 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, i = e; n = n + 1; - -next: - /* move on */ - if (state.ptr == state.end) - break; - if (state.ptr == state.start) - state.start = (void*) ((char*) state.ptr + state.charsize); - else - state.start = 
state.ptr; - + state.must_advance = 1; + state.start = state.ptr; } /* get segment following last match */ @@ -2450,7 +2421,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self) state->ptr = state->start; - status = sre_match(state, PatternObject_GetCode(self->pattern), 0); + status = sre_match(state, PatternObject_GetCode(self->pattern)); if (PyErr_Occurred()) return NULL; @@ -2459,12 +2430,10 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self) if (status == 0) state->start = NULL; - else if (state->ptr != state->start) + else { + state->must_advance = (state->ptr == state->start); state->start = state->ptr; - else if (state->ptr != state->end) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = NULL; + } return match; } @@ -2499,12 +2468,10 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self) if (status == 0) state->start = NULL; - else if (state->ptr != state->start) + else { + state->must_advance = (state->ptr == state->start); state->start = state->ptr; - else if (state->ptr != state->end) - state->start = (void*) ((char*) state->ptr + state->charsize); - else - state->start = NULL; + } return match; } diff --git a/Modules/sre.h b/Modules/sre.h index 585d2841a66..a7284881457 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -67,6 +67,7 @@ typedef struct { void* end; /* end of original string */ /* attributes for the match object */ PyObject* string; + Py_buffer buffer; Py_ssize_t pos, endpos; int isbytes; int charsize; /* character size */ @@ -74,11 +75,12 @@ typedef struct { Py_ssize_t lastindex; Py_ssize_t lastmark; void** mark; + int match_all; + int must_advance; /* dynamically allocated stuff */ char* data_stack; size_t data_stack_size; size_t data_stack_base; - Py_buffer buffer; /* current repeat context */ SRE_REPEAT *repeat; } SRE_STATE; diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index e13b90e8bc0..44948e21ad9 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -199,7 +199,7 @@ 
SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch) return up != lo && SRE(charset)(state, set, up); } -LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all); +LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel); LOCAL(Py_ssize_t) SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) @@ -510,12 +510,12 @@ do { \ #define JUMP_ASSERT 12 #define JUMP_ASSERT_NOT 13 -#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \ +#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \ DATA_ALLOC(SRE(match_context), nextctx); \ nextctx->last_ctx_pos = ctx_pos; \ nextctx->jump = jumpvalue; \ nextctx->pattern = nextpattern; \ - nextctx->match_all = matchall; \ + nextctx->toplevel = toplevel_; \ ctx_pos = alloc_pos; \ ctx = nextctx; \ goto entrance; \ @@ -523,7 +523,7 @@ do { \ while (0) /* gcc doesn't like labels at end of scopes */ \ #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ - DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all) + DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel) #define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \ DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0) @@ -540,13 +540,13 @@ typedef struct { SRE_CODE chr; SRE_REPEAT* rep; } u; - int match_all; + int toplevel; } SRE(match_context); /* check if string matches the given pattern. 
returns <0 for error, 0 for failure, and 1 for success */ LOCAL(Py_ssize_t) -SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all) +SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) { SRE_CHAR* end = (SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; @@ -563,7 +563,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all) ctx->last_ctx_pos = -1; ctx->jump = JUMP_NONE; ctx->pattern = pattern; - ctx->match_all = match_all; + ctx->toplevel = toplevel; ctx_pos = alloc_pos; entrance: @@ -636,11 +636,14 @@ entrance: case SRE_OP_SUCCESS: /* end of pattern */ TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr)); - if (!ctx->match_all || ctx->ptr == state->end) { - state->ptr = ctx->ptr; - RETURN_SUCCESS; + if (ctx->toplevel && + ((state->match_all && ctx->ptr != state->end) || + (state->must_advance && ctx->ptr == state->start))) + { + RETURN_FAILURE; } - RETURN_FAILURE; + state->ptr = ctx->ptr; + RETURN_SUCCESS; case SRE_OP_AT: /* match at given position */ @@ -856,7 +859,9 @@ entrance: RETURN_FAILURE; if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && - ctx->ptr == state->end) { + ctx->ptr == state->end && + !(ctx->toplevel && state->must_advance && ctx->ptr == state->start)) + { /* tail is empty. we're finished */ state->ptr = ctx->ptr; RETURN_SUCCESS; @@ -941,7 +946,10 @@ entrance: } if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && - (!match_all || ctx->ptr == state->end)) { + !(ctx->toplevel && + ((state->match_all && ctx->ptr != state->end) || + (state->must_advance && ctx->ptr == state->start)))) + { /* tail is empty. 
we're finished */ state->ptr = ctx->ptr; RETURN_SUCCESS; @@ -1417,6 +1425,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) return 0; /* literal can't match: doesn't fit in char width */ #endif end = (SRE_CHAR *)state->end; + state->must_advance = 0; while (ptr < end) { while (*ptr != c) { if (++ptr >= end) @@ -1458,6 +1467,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) return 0; i = 1; + state->must_advance = 0; do { if (*ptr == (SRE_CHAR) prefix[i]) { if (++i != prefix_len) { @@ -1487,6 +1497,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) if (charset) { /* pattern starts with a character from a known set */ end = (SRE_CHAR *)state->end; + state->must_advance = 0; for (;;) { while (ptr < end && !SRE(charset)(state, charset, *ptr)) ptr++; @@ -1503,13 +1514,15 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) } else { /* general case */ assert(ptr <= end); - while (1) { + TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); + state->start = state->ptr = ptr; + status = SRE(match)(state, pattern, 1); + state->must_advance = 0; + while (status == 0 && ptr < end) { + ptr++; TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); state->start = state->ptr = ptr; status = SRE(match)(state, pattern, 0); - if (status != 0 || ptr >= end) - break; - ptr++; } }