From 83e802796c80f46be616b48020356f7f51be533d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 3 Feb 2015 11:04:19 +0200 Subject: [PATCH] Issue #22818: Splitting on a pattern that could match an empty string now raises a warning. Patterns that can only match empty strings are now rejected. --- Doc/library/re.rst | 32 ++++++++++++++++++++++++++------ Doc/whatsnew/3.5.rst | 7 +++++++ Lib/sre_compile.py | 10 +++++----- Lib/test/test_re.py | 39 ++++++++++++++++++++++++++++++--------- Misc/NEWS | 4 ++++ Modules/_sre.c | 13 +++++++++++++ 6 files changed, 85 insertions(+), 20 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 60ded8b2f4a..8e204960114 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -626,17 +626,37 @@ form. That way, separator components are always found at the same relative indices within the result list. - Note that *split* will never split a string on an empty pattern match. - For example: + .. note:: - >>> re.split('x*', 'foo') - ['foo'] - >>> re.split("(?m)^$", "foo\n\nbar\n") - ['foo\n\nbar\n'] + :func:`split` doesn't currently split a string on an empty pattern match. + For example: + + >>> re.split('x*', 'axbc') + ['a', 'bc'] + + Even though ``'x*'`` also matches 0 'x' before 'a', between 'b' and 'c', + and after 'c', currently these matches are ignored. The correct behavior + (i.e. splitting on empty matches too and returning ``['', 'a', 'b', 'c', + '']``) will be implemented in future versions of Python, but since this + is a backward incompatible change, a :exc:`FutureWarning` will be raised + in the meanwhile. + + Patterns that can only match empty strings currently never split the + string. Since this doesn't match the expected behavior, a + :exc:`ValueError` will be raised starting from Python 3.5:: + + >>> re.split("^$", "foo\n\nbar\n", flags=re.M) + Traceback (most recent call last): + File "", line 1, in + ... + ValueError: split() requires a non-empty pattern match. .. versionchanged:: 3.1 Added the optional flags argument. + .. versionchanged:: 3.5 + Splitting on a pattern that could match an empty string now raises + a warning. Patterns that can only match empty strings are now rejected. .. function:: findall(pattern, string, flags=0) diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst index c309aa80c7c..f7b9a83c2ec 100644 --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -482,6 +482,13 @@ Changes in the Python API simply define :meth:`~importlib.machinery.Loader.create_module` to return ``None`` (:issue:`23014`). +* :func:`re.split` always ignored empty pattern matches, so the ``'x*'`` + pattern worked the same as ``'x+'``, and the ``'\b'`` pattern never worked. + Now :func:`re.split` raises a warning if the pattern could match + an empty string. For compatibility use patterns that never match an empty + string (e.g. ``'x+'`` instead of ``'x*'``). Patterns that could only match + an empty string (such as ``'\b'``) now raise an error. + Changes in the C API -------------------- diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 1241a01c3ea..30a5fae4851 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -414,8 +414,11 @@ def _compile_info(code, pattern, flags): # this contains min/max pattern width, and an optional literal # prefix or a character map lo, hi = pattern.getwidth() + if hi > MAXCODE: + hi = MAXCODE if lo == 0: - return # not worth it + code.extend([INFO, 4, 0, lo, hi]) + return # look for a literal prefix prefix = [] prefixappend = prefix.append @@ -495,10 +498,7 @@ def _compile_info(code, pattern, flags): else: emit(MAXCODE) prefix = prefix[:MAXCODE] - if hi < MAXCODE: - emit(hi) - else: - emit(0) + emit(min(hi, MAXCODE)) # add literal prefix if prefix: emit(len(prefix)) # length diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 6e90b2fec9a..2fb4764abf3 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -251,28 +251,28 @@ class ReTests(unittest.TestCase): for string in ":a:b::c", S(":a:b::c"): self.assertTypedEqual(re.split(":", string), ['', 'a', 'b', '', 'c']) - self.assertTypedEqual(re.split(":*", string), + self.assertTypedEqual(re.split(":+", string), ['', 'a', 'b', 'c']) - self.assertTypedEqual(re.split("(:*)", string), + self.assertTypedEqual(re.split("(:+)", string), ['', ':', 'a', ':', 'b', '::', 'c']) for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), memoryview(b":a:b::c")): self.assertTypedEqual(re.split(b":", string), [b'', b'a', b'b', b'', b'c']) - self.assertTypedEqual(re.split(b":*", string), + self.assertTypedEqual(re.split(b":+", string), [b'', b'a', b'b', b'c']) - self.assertTypedEqual(re.split(b"(:*)", string), + self.assertTypedEqual(re.split(b"(:+)", string), [b'', b':', b'a', b':', b'b', b'::', b'c']) for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", "\U0001d49c\U0001d49e\U0001d4b5"): string = ":%s:%s::%s" % (a, b, c) self.assertEqual(re.split(":", string), ['', a, b, '', c]) - self.assertEqual(re.split(":*", string), ['', a, b, c]) - self.assertEqual(re.split("(:*)", string), + self.assertEqual(re.split(":+", string), ['', a, b, c]) + self.assertEqual(re.split("(:+)", string), ['', ':', a, ':', b, '::', c]) - self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) - self.assertEqual(re.split("(:)*", ":a:b::c"), + self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(re.split("(:)+", ":a:b::c"), ['', ':', 'a', ':', 'b', ':', 'c']) self.assertEqual(re.split("([b:]+)", ":a:b::c"), ['', ':', 'a', ':b::', 'c']) @@ -282,13 +282,34 @@ class ReTests(unittest.TestCase): self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', '', 'c']) + for sep, expected in [ + (':*', ['', 'a', 'b', 'c']), + ('(?::*)', ['', 'a', 'b', 'c']), + ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']), + ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']), + ]: + with self.subTest(sep=sep), self.assertWarns(FutureWarning): + self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) + + for sep, expected in [ + ('', [':a:b::c']), + (r'\b', [':a:b::c']), + (r'(?=:)', [':a:b::c']), + (r'(?<=:)', [':a:b::c']), + ]: + with self.subTest(sep=sep), self.assertRaises(ValueError): + self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) + def test_qualified_re_split(self): self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c']) self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d']) self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c']) - self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), + self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c']) + with self.assertWarns(FutureWarning): + self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), + ['', ':', 'a', ':', 'b::c']) def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) diff --git a/Misc/NEWS b/Misc/NEWS index 7a3a5aea005..a6cc4b57ace 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -232,6 +232,10 @@ Core and Builtins Library ------- +- Issue #22818: Splitting on a pattern that could match an empty string now + raises a warning. Patterns that can only match empty strings are now + rejected. + - Issue #23099: Closing io.BytesIO with exported buffer is rejected now to prevent corrupting exported buffer. diff --git a/Modules/_sre.c b/Modules/_sre.c index 63778f4e6bf..9550d97c1bd 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -863,6 +863,19 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw) if (!string) return NULL; + assert(self->codesize != 0); + if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) { + if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) { + PyErr_SetString(PyExc_ValueError, + "split() requires a non-empty pattern match."); + return NULL; + } + if (PyErr_WarnEx(PyExc_FutureWarning, + "split() requires a non-empty pattern match.", + 1) < 0) + return NULL; + } + string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); if (!string) return NULL;