bpo-25054, bpo-1647489: Added support for splitting on zero-width patterns. (#4471)

Also fixed searching for patterns that could match an empty string.
2017-12-04 14:29:05 +02:00 · 2017-12-04 14:29:05 +02:00 · 70d56fb525
parent e69fbb6a56
commit 70d56fb525
9 changed files with 128 additions and 117 deletions
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@ -708,37 +708,19 @@ form.
   That way, separator components are always found at the same relative
   indices within the result list.

-   .. note::
+   The pattern can match empty strings. ::

-      :func:`split` doesn't currently split a string on an empty pattern match.
-      For example::
-
-         >>> re.split('x*', 'axbc')
-         ['a', 'bc']
-
-      Even though ``'x*'`` also matches 0 'x' before 'a', between 'b' and 'c',
-      and after 'c', currently these matches are ignored.  The correct behavior
-      (i.e. splitting on empty matches too and returning ``['', 'a', 'b', 'c',
-      '']``) will be implemented in future versions of Python, but since this
-      is a backward incompatible change, a :exc:`FutureWarning` will be raised
-      in the meanwhile.
-
-      Patterns that can only match empty strings currently never split the
-      string.  Since this doesn't match the expected behavior, a
-      :exc:`ValueError` will be raised starting from Python 3.5::
-
-         >>> re.split("^$", "foo\n\nbar\n", flags=re.M)
-         Traceback (most recent call last):
-           File "<stdin>", line 1, in <module>
-           ...
-         ValueError: split() requires a non-empty pattern match.
+      >>> re.split(r'\b', 'Words, words, words.')
+      ['', 'Words', ', ', 'words', ', ', 'words', '.']
+      >>> re.split(r'(\W*)', '...words...')
+      ['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '']

   .. versionchanged:: 3.1
      Added the optional flags argument.

-   .. versionchanged:: 3.5
-      Splitting on a pattern that could match an empty string now raises
-      a warning.  Patterns that can only match empty strings are now rejected.
+   .. versionchanged:: 3.7
+      Added support for splitting on a pattern that could match an empty string.
+

 .. function:: findall(pattern, string, flags=0)

@ -746,8 +728,10 @@ form.
   strings.  The *string* is scanned left-to-right, and matches are returned in
   the order found.  If one or more groups are present in the pattern, return a
   list of groups; this will be a list of tuples if the pattern has more than
-   one group.  Empty matches are included in the result unless they touch the
-   beginning of another match.
+   one group.  Empty matches are included in the result.
+
+   .. versionchanged:: 3.7
+      Non-empty matches can now start just after a previous empty match.


 .. function:: finditer(pattern, string, flags=0)
@ -755,8 +739,10 @@ form.
   Return an :term:`iterator` yielding :ref:`match objects <match-objects>` over
   all non-overlapping matches for the RE *pattern* in *string*.  The *string*
   is scanned left-to-right, and matches are returned in the order found.  Empty
-   matches are included in the result unless they touch the beginning of another
-   match.
+   matches are included in the result.
+
+   .. versionchanged:: 3.7
+      Non-empty matches can now start just after a previous empty match.


 .. function:: sub(pattern, repl, string, count=0, flags=0)
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@ -364,6 +364,10 @@ The flags :const:`re.ASCII`, :const:`re.LOCALE` and :const:`re.UNICODE`
 can be set within the scope of a group.
 (Contributed by Serhiy Storchaka in :issue:`31690`.)

+:func:`re.split` now supports splitting on a pattern like ``r'\b'``,
+``'^$'`` or ``(?=-)`` that matches an empty string.
+(Contributed by Serhiy Storchaka in :issue:`25054`.)
+
 string
 ------

@ -768,6 +772,23 @@ Changes in the Python API
  avoid a warning escape them with a backslash.
  (Contributed by Serhiy Storchaka in :issue:`30349`.)

+* The result of splitting a string on a :mod:`regular expression <re>`
+  that could match an empty string has been changed.  For example,
+  splitting on ``r'\s*'`` will now split not only on whitespace as it
+  did previously, but also between any pair of non-whitespace
+  characters.  The previous behavior can be restored by changing the pattern
+  to ``r'\s+'``.  A :exc:`FutureWarning` has been emitted for such patterns
+  since Python 3.5.
+
+  For patterns that match both empty and non-empty strings, the result of
+  searching for all matches may also be changed in other cases.  For example
+  in the string ``'a\n\n'``, the pattern ``r'(?m)^\s*?$'`` will not only
+  match empty strings at positions 2 and 3, but also the string ``'\n'`` at
+  positions 2--3.  To match only blank lines, the pattern should be rewritten
+  as ``r'(?m)^[^\S\n]*$'``.
+
+  (Contributed by Serhiy Storchaka in :issue:`25054`.)
+
 * :class:`tracemalloc.Traceback` frames are now sorted from oldest to most
  recent to be more consistent with :mod:`traceback`.
  (Contributed by Jesse Bakker in :issue:`32121`.)
--- a/Lib/doctest.py
+++ b/Lib/doctest.py
@ -1611,7 +1611,7 @@ class OutputChecker:
                          '', want)
            # If a line in got contains only spaces, then remove the
            # spaces.
-            got = re.sub(r'(?m)^\s*?$', '', got)
+            got = re.sub(r'(?m)^[^\S\n]+$', '', got)
            if got == want:
                return True

--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@ -331,21 +331,21 @@ class ReTests(unittest.TestCase):
                         ['', 'a', '', '', 'c'])

        for sep, expected in [
-            (':*', ['', 'a', 'b', 'c']),
-            ('(?::*)', ['', 'a', 'b', 'c']),
-            ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
-            ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
+            (':*', ['', 'a', 'b', 'c', '']),
+            ('(?::*)', ['', 'a', 'b', 'c', '']),
+            ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']),
+            ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']),
        ]:
-            with self.subTest(sep=sep), self.assertWarns(FutureWarning):
+            with self.subTest(sep=sep):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

        for sep, expected in [
-            ('', [':a:b::c']),
-            (r'\b', [':a:b::c']),
-            (r'(?=:)', [':a:b::c']),
-            (r'(?<=:)', [':a:b::c']),
+            ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
+            (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
+            (r'(?=:)', ['', ':a', ':b', ':', ':c']),
+            (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
        ]:
-            with self.subTest(sep=sep), self.assertRaises(ValueError):
+            with self.subTest(sep=sep):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    def test_qualified_re_split(self):
@ -356,7 +356,6 @@ class ReTests(unittest.TestCase):
                         ['', ':', 'a', ':', 'b::c'])
        self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
                         ['', ':', 'a', ':', 'b::c'])
-        with self.assertWarns(FutureWarning):
        self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
                         ['', ':', 'a', ':', 'b::c'])

@ -1751,6 +1750,25 @@ class ReTests(unittest.TestCase):
                         "span=(3, 5), match='bb'>" %
                         (type(second).__module__, type(second).__qualname__))

+    def test_zerowidth(self):
+        # Issues 852532, 1647489, 3262, 25054.
+        self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
+        self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', ''])
+        self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', 'bc'])
+        self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
+
+        self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
+        self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a--bc-')
+        self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::]bc[]')
+
+        self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
+        self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
+                         ['', 'a', '', '', 'bc', ''])
+
+        self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
+                         [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
+        self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
+                         [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])

    def test_bug_2537(self):
        # issue 2537: empty submatches
--- a/Misc/NEWS.d/next/Library/2017-11-20-01-01-01.bpo-25054.rOlRV6.rst
+++ b/Misc/NEWS.d/next/Library/2017-11-20-01-01-01.bpo-25054.rOlRV6.rst
@ -0,0 +1 @@
+Added support for splitting on a pattern that could match an empty string.
--- a/Misc/NEWS.d/next/Library/2017-11-20-01-29-46.bpo-1647489.-ZNNkh.rst
+++ b/Misc/NEWS.d/next/Library/2017-11-20-01-29-46.bpo-1647489.-ZNNkh.rst
@ -0,0 +1,3 @@
+Fixed searching for regular expression patterns that could match an empty
+string. A non-empty string can now be correctly found after matching an
+empty string.
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@ -446,6 +446,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,

    state->isbytes = isbytes;
    state->charsize = charsize;
+    state->match_all = 0;
+    state->must_advance = 0;

    state->beginning = ptr;

@ -559,14 +561,14 @@ pattern_dealloc(PatternObject* self)
 }

 LOCAL(Py_ssize_t)
-sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+sre_match(SRE_STATE* state, SRE_CODE* pattern)
 {
    if (state->charsize == 1)
-        return sre_ucs1_match(state, pattern, match_all);
+        return sre_ucs1_match(state, pattern, 1);
    if (state->charsize == 2)
-        return sre_ucs2_match(state, pattern, match_all);
+        return sre_ucs2_match(state, pattern, 1);
    assert(state->charsize == 4);
-    return sre_ucs4_match(state, pattern, match_all);
+    return sre_ucs4_match(state, pattern, 1);
 }

 LOCAL(Py_ssize_t)
@ -606,7 +608,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string,

    TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));

-    status = sre_match(&state, PatternObject_GetCode(self), 0);
+    status = sre_match(&state, PatternObject_GetCode(self));

    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
    if (PyErr_Occurred()) {
@ -645,7 +647,8 @@ _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string,

    TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));

-    status = sre_match(&state, PatternObject_GetCode(self), 1);
+    state.match_all = 1;
+    status = sre_match(&state, PatternObject_GetCode(self));

    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
    if (PyErr_Occurred()) {
@ -808,11 +811,8 @@ _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
        if (status < 0)
            goto error;

-        if (state.ptr == state.start)
-            state.start = (void*) ((char*) state.ptr + state.charsize);
-        else
+        state.must_advance = (state.ptr == state.start);
        state.start = state.ptr;
-
    }

    state_fini(&state);
@ -901,17 +901,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
    void* last;

    assert(self->codesize != 0);
-    if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
-        if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
-            PyErr_SetString(PyExc_ValueError,
-                            "split() requires a non-empty pattern match.");
-            return NULL;
-        }
-        if (PyErr_WarnEx(PyExc_FutureWarning,
-                         "split() requires a non-empty pattern match.",
-                         1) < 0)
-            return NULL;
-    }

    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
        return NULL;
@ -942,14 +931,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
            goto error;
        }

-        if (state.start == state.ptr) {
-            if (last == state.end || state.ptr == state.end)
-                break;
-            /* skip one character */
-            state.start = (void*) ((char*) state.ptr + state.charsize);
-            continue;
-        }
-
        /* get segment before this match */
        item = getslice(state.isbytes, state.beginning,
            string, STATE_OFFSET(&state, last),
@ -974,7 +955,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
        }

        n = n + 1;
-
+        state.must_advance = 1;
        last = state.start = state.ptr;

    }
@ -1101,9 +1082,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
            if (status < 0)
                goto error;

-        } else if (i == b && i == e && n > 0)
-            /* ignore empty match on latest position */
-            goto next;
+        }

        if (filter_is_callable) {
            /* pass match object through filter */
@ -1130,16 +1109,8 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,

        i = e;
        n = n + 1;
-
-next:
-        /* move on */
-        if (state.ptr == state.end)
-            break;
-        if (state.ptr == state.start)
-            state.start = (void*) ((char*) state.ptr + state.charsize);
-        else
+        state.must_advance = 1;
        state.start = state.ptr;
-
    }

    /* get segment following last match */
@ -2450,7 +2421,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self)

    state->ptr = state->start;

-    status = sre_match(state, PatternObject_GetCode(self->pattern), 0);
+    status = sre_match(state, PatternObject_GetCode(self->pattern));
    if (PyErr_Occurred())
        return NULL;

@ -2459,12 +2430,10 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self)

    if (status == 0)
        state->start = NULL;
-    else if (state->ptr != state->start)
+    else {
+        state->must_advance = (state->ptr == state->start);
        state->start = state->ptr;
-    else if (state->ptr != state->end)
-        state->start = (void*) ((char*) state->ptr + state->charsize);
-    else
-        state->start = NULL;
+    }

    return match;
 }
@ -2499,12 +2468,10 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self)

    if (status == 0)
        state->start = NULL;
-    else if (state->ptr != state->start)
+    else {
+        state->must_advance = (state->ptr == state->start);
        state->start = state->ptr;
-    else if (state->ptr != state->end)
-        state->start = (void*) ((char*) state->ptr + state->charsize);
-    else
-        state->start = NULL;
+    }

    return match;
 }
--- a/Modules/sre.h
+++ b/Modules/sre.h
@ -67,6 +67,7 @@ typedef struct {
    void* end; /* end of original string */
    /* attributes for the match object */
    PyObject* string;
+    Py_buffer buffer;
    Py_ssize_t pos, endpos;
    int isbytes;
    int charsize; /* character size */
@ -74,11 +75,12 @@ typedef struct {
    Py_ssize_t lastindex;
    Py_ssize_t lastmark;
    void** mark;
+    int match_all;
+    int must_advance;
    /* dynamically allocated stuff */
    char* data_stack;
    size_t data_stack_size;
    size_t data_stack_base;
-    Py_buffer buffer;
    /* current repeat context */
    SRE_REPEAT *repeat;
 } SRE_STATE;
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@ -199,7 +199,7 @@ SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
    return up != lo && SRE(charset)(state, set, up);
 }

-LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
+LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel);

 LOCAL(Py_ssize_t)
 SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
@ -510,12 +510,12 @@ do { \
 #define JUMP_ASSERT          12
 #define JUMP_ASSERT_NOT      13

-#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \
+#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
    DATA_ALLOC(SRE(match_context), nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = jumpvalue; \
    nextctx->pattern = nextpattern; \
-    nextctx->match_all = matchall; \
+    nextctx->toplevel = toplevel_; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
@ -523,7 +523,7 @@ do { \
    while (0) /* gcc doesn't like labels at end of scopes */ \

 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
-    DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all)
+    DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel)

 #define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
    DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
@ -540,13 +540,13 @@ typedef struct {
        SRE_CODE chr;
        SRE_REPEAT* rep;
    } u;
-    int match_all;
+    int toplevel;
 } SRE(match_context);

 /* check if string matches the given pattern.  returns <0 for
   error, 0 for failure, and 1 for success */
 LOCAL(Py_ssize_t)
-SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
 {
    SRE_CHAR* end = (SRE_CHAR *)state->end;
    Py_ssize_t alloc_pos, ctx_pos = -1;
@ -563,7 +563,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
    ctx->last_ctx_pos = -1;
    ctx->jump = JUMP_NONE;
    ctx->pattern = pattern;
-    ctx->match_all = match_all;
+    ctx->toplevel = toplevel;
    ctx_pos = alloc_pos;

 entrance:
@ -636,11 +636,14 @@ entrance:
        case SRE_OP_SUCCESS:
            /* end of pattern */
            TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
-            if (!ctx->match_all || ctx->ptr == state->end) {
+            if (ctx->toplevel &&
+                ((state->match_all && ctx->ptr != state->end) ||
+                 (state->must_advance && ctx->ptr == state->start)))
+            {
+                RETURN_FAILURE;
+            }
            state->ptr = ctx->ptr;
            RETURN_SUCCESS;
-            }
-            RETURN_FAILURE;

        case SRE_OP_AT:
            /* match at given position */
@ -856,7 +859,9 @@ entrance:
                RETURN_FAILURE;

            if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
-                ctx->ptr == state->end) {
+                ctx->ptr == state->end &&
+                !(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
+            {
                /* tail is empty.  we're finished */
                state->ptr = ctx->ptr;
                RETURN_SUCCESS;
@ -941,7 +946,10 @@ entrance:
            }

            if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
-                (!match_all || ctx->ptr == state->end)) {
+                !(ctx->toplevel &&
+                  ((state->match_all && ctx->ptr != state->end) ||
+                   (state->must_advance && ctx->ptr == state->start))))
+            {
                /* tail is empty.  we're finished */
                state->ptr = ctx->ptr;
                RETURN_SUCCESS;
@ -1417,6 +1425,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
            return 0; /* literal can't match: doesn't fit in char width */
 #endif
        end = (SRE_CHAR *)state->end;
+        state->must_advance = 0;
        while (ptr < end) {
            while (*ptr != c) {
                if (++ptr >= end)
@ -1458,6 +1467,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
                return 0;

            i = 1;
+            state->must_advance = 0;
            do {
                if (*ptr == (SRE_CHAR) prefix[i]) {
                    if (++i != prefix_len) {
@ -1487,6 +1497,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
    if (charset) {
        /* pattern starts with a character from a known set */
        end = (SRE_CHAR *)state->end;
+        state->must_advance = 0;
        for (;;) {
            while (ptr < end && !SRE(charset)(state, charset, *ptr))
                ptr++;
@ -1503,13 +1514,15 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
    } else {
        /* general case */
        assert(ptr <= end);
-        while (1) {
+        TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
+        state->start = state->ptr = ptr;
+        status = SRE(match)(state, pattern, 1);
+        state->must_advance = 0;
+        while (status == 0 && ptr < end) {
+            ptr++;
            TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
            state->start = state->ptr = ptr;
            status = SRE(match)(state, pattern, 0);
-            if (status != 0 || ptr >= end)
-                break;
-            ptr++;
        }
    }
				`@ -0,0 +1 @@`
				`Added support of splitting on a pattern that could match an empty string.`