Issue #22818: Splitting on a pattern that could match an empty string now
raises a warning. Patterns that can only match empty strings are now rejected.
This commit is contained in:
parent
32ca3dcb97
commit
83e802796c
|
@ -626,17 +626,37 @@ form.
|
|||
That way, separator components are always found at the same relative
|
||||
indices within the result list.
|
||||
|
||||
Note that *split* will never split a string on an empty pattern match.
|
||||
For example:
|
||||
.. note::
|
||||
|
||||
>>> re.split('x*', 'foo')
|
||||
['foo']
|
||||
>>> re.split("(?m)^$", "foo\n\nbar\n")
|
||||
['foo\n\nbar\n']
|
||||
:func:`split` doesn't currently split a string on an empty pattern match.
|
||||
For example:
|
||||
|
||||
>>> re.split('x*', 'axbc')
|
||||
['a', 'bc']
|
||||
|
||||
Even though ``'x*'`` also matches 0 'x' before 'a', between 'b' and 'c',
|
||||
and after 'c', currently these matches are ignored. The correct behavior
|
||||
(i.e. splitting on empty matches too and returning ``['', 'a', 'b', 'c',
|
||||
'']``) will be implemented in future versions of Python, but since this
|
||||
is a backward-incompatible change, a :exc:`FutureWarning` will be issued
|
||||
in the meantime.
|
||||
|
||||
Patterns that can only match empty strings currently never split the
|
||||
string. Since this doesn't match the expected behavior, a
|
||||
:exc:`ValueError` will be raised starting from Python 3.5::
|
||||
|
||||
>>> re.split("^$", "foo\n\nbar\n", flags=re.M)
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
...
|
||||
ValueError: split() requires a non-empty pattern match.
|
||||
|
||||
.. versionchanged:: 3.1
|
||||
Added the optional flags argument.
|
||||
|
||||
.. versionchanged:: 3.5
|
||||
Splitting on a pattern that could match an empty string now raises
|
||||
a warning. Patterns that can only match empty strings are now rejected.
|
||||
|
||||
.. function:: findall(pattern, string, flags=0)
|
||||
|
||||
|
|
|
@ -482,6 +482,13 @@ Changes in the Python API
|
|||
simply define :meth:`~importlib.machinery.Loader.create_module` to return
|
||||
``None`` (:issue:`23014`).
|
||||
|
||||
* :func:`re.split` always ignored empty pattern matches, so the ``'x*'``
|
||||
pattern worked the same as ``'x+'``, and the ``'\b'`` pattern never worked.
|
||||
Now :func:`re.split` raises a :exc:`FutureWarning` if the pattern could match
|
||||
an empty string. For compatibility use patterns that never match an empty
|
||||
string (e.g. ``'x+'`` instead of ``'x*'``). Patterns that could only match
|
||||
an empty string (such as ``'\b'``) now raise :exc:`ValueError`.
|
||||
|
||||
Changes in the C API
|
||||
--------------------
|
||||
|
||||
|
|
|
@ -414,8 +414,11 @@ def _compile_info(code, pattern, flags):
|
|||
# this contains min/max pattern width, and an optional literal
|
||||
# prefix or a character map
|
||||
lo, hi = pattern.getwidth()
|
||||
if hi > MAXCODE:
|
||||
hi = MAXCODE
|
||||
if lo == 0:
|
||||
return # not worth it
|
||||
code.extend([INFO, 4, 0, lo, hi])
|
||||
return
|
||||
# look for a literal prefix
|
||||
prefix = []
|
||||
prefixappend = prefix.append
|
||||
|
@ -495,10 +498,7 @@ def _compile_info(code, pattern, flags):
|
|||
else:
|
||||
emit(MAXCODE)
|
||||
prefix = prefix[:MAXCODE]
|
||||
if hi < MAXCODE:
|
||||
emit(hi)
|
||||
else:
|
||||
emit(0)
|
||||
emit(min(hi, MAXCODE))
|
||||
# add literal prefix
|
||||
if prefix:
|
||||
emit(len(prefix)) # length
|
||||
|
|
|
@ -251,28 +251,28 @@ class ReTests(unittest.TestCase):
|
|||
for string in ":a:b::c", S(":a:b::c"):
|
||||
self.assertTypedEqual(re.split(":", string),
|
||||
['', 'a', 'b', '', 'c'])
|
||||
self.assertTypedEqual(re.split(":*", string),
|
||||
self.assertTypedEqual(re.split(":+", string),
|
||||
['', 'a', 'b', 'c'])
|
||||
self.assertTypedEqual(re.split("(:*)", string),
|
||||
self.assertTypedEqual(re.split("(:+)", string),
|
||||
['', ':', 'a', ':', 'b', '::', 'c'])
|
||||
for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
|
||||
memoryview(b":a:b::c")):
|
||||
self.assertTypedEqual(re.split(b":", string),
|
||||
[b'', b'a', b'b', b'', b'c'])
|
||||
self.assertTypedEqual(re.split(b":*", string),
|
||||
self.assertTypedEqual(re.split(b":+", string),
|
||||
[b'', b'a', b'b', b'c'])
|
||||
self.assertTypedEqual(re.split(b"(:*)", string),
|
||||
self.assertTypedEqual(re.split(b"(:+)", string),
|
||||
[b'', b':', b'a', b':', b'b', b'::', b'c'])
|
||||
for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
|
||||
"\U0001d49c\U0001d49e\U0001d4b5"):
|
||||
string = ":%s:%s::%s" % (a, b, c)
|
||||
self.assertEqual(re.split(":", string), ['', a, b, '', c])
|
||||
self.assertEqual(re.split(":*", string), ['', a, b, c])
|
||||
self.assertEqual(re.split("(:*)", string),
|
||||
self.assertEqual(re.split(":+", string), ['', a, b, c])
|
||||
self.assertEqual(re.split("(:+)", string),
|
||||
['', ':', a, ':', b, '::', c])
|
||||
|
||||
self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
|
||||
self.assertEqual(re.split("(:)*", ":a:b::c"),
|
||||
self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
|
||||
self.assertEqual(re.split("(:)+", ":a:b::c"),
|
||||
['', ':', 'a', ':', 'b', ':', 'c'])
|
||||
self.assertEqual(re.split("([b:]+)", ":a:b::c"),
|
||||
['', ':', 'a', ':b::', 'c'])
|
||||
|
@ -282,13 +282,34 @@ class ReTests(unittest.TestCase):
|
|||
self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
|
||||
['', 'a', '', '', 'c'])
|
||||
|
||||
for sep, expected in [
|
||||
(':*', ['', 'a', 'b', 'c']),
|
||||
('(?::*)', ['', 'a', 'b', 'c']),
|
||||
('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
|
||||
('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
|
||||
]:
|
||||
with self.subTest(sep=sep), self.assertWarns(FutureWarning):
|
||||
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
|
||||
|
||||
for sep, expected in [
|
||||
('', [':a:b::c']),
|
||||
(r'\b', [':a:b::c']),
|
||||
(r'(?=:)', [':a:b::c']),
|
||||
(r'(?<=:)', [':a:b::c']),
|
||||
]:
|
||||
with self.subTest(sep=sep), self.assertRaises(ValueError):
|
||||
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
|
||||
|
||||
def test_qualified_re_split(self):
|
||||
self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
|
||||
self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
|
||||
self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
|
||||
['', ':', 'a', ':', 'b::c'])
|
||||
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
|
||||
self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
|
||||
['', ':', 'a', ':', 'b::c'])
|
||||
with self.assertWarns(FutureWarning):
|
||||
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
|
||||
['', ':', 'a', ':', 'b::c'])
|
||||
|
||||
def test_re_findall(self):
|
||||
self.assertEqual(re.findall(":+", "abc"), [])
|
||||
|
|
|
@ -232,6 +232,10 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #22818: Splitting on a pattern that could match an empty string now
|
||||
raises a warning. Patterns that can only match empty strings are now
|
||||
rejected.
|
||||
|
||||
- Issue #23099: Closing io.BytesIO with exported buffer is rejected now to
|
||||
prevent corrupting exported buffer.
|
||||
|
||||
|
|
|
@ -863,6 +863,19 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
|||
if (!string)
|
||||
return NULL;
|
||||
|
||||
assert(self->codesize != 0);
|
||||
if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
|
||||
if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"split() requires a non-empty pattern match.");
|
||||
return NULL;
|
||||
}
|
||||
if (PyErr_WarnEx(PyExc_FutureWarning,
|
||||
"split() requires a non-empty pattern match.",
|
||||
1) < 0)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
|
||||
if (!string)
|
||||
return NULL;
|
||||
|
|
Loading…
Reference in New Issue