bpo-30349: Raise FutureWarning for nested sets and set operations (#1553)
in regular expressions.
This commit is contained in:
parent
3daaafb700
commit
05cb728d68
|
@ -200,6 +200,20 @@ The special characters are:
|
||||||
place it at the beginning of the set. For example, both ``[()[\]{}]`` and
|
place it at the beginning of the set. For example, both ``[()[\]{}]`` and
|
||||||
``[]()[{}]`` will match a parenthesis.
|
``[]()[{}]`` will match a parenthesis.
|
||||||
|
|
||||||
|
* Support of nested sets and set operations as in `Unicode Technical
|
||||||
|
Standard #18`_ might be added in the future. This would change the
|
||||||
|
syntax, so to facilitate this change a :exc:`FutureWarning` will be raised
|
||||||
|
in ambiguous cases for the time being.
|
||||||
|
This includes sets starting with a literal ``'['`` or containing literal
|
||||||
|
character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To
|
||||||
|
avoid a warning, escape them with a backslash.
|
||||||
|
|
||||||
|
.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/
|
||||||
|
|
||||||
|
.. versionchanged:: 3.7
|
||||||
|
:exc:`FutureWarning` is raised if a character set contains constructs
|
||||||
|
that will change semantically in the future.
|
||||||
|
|
||||||
``|``
|
``|``
|
||||||
``A|B``, where *A* and *B* can be arbitrary REs, creates a regular expression that
|
``A|B``, where *A* and *B* can be arbitrary REs, creates a regular expression that
|
||||||
will match either *A* or *B*. An arbitrary number of REs can be separated by the
|
will match either *A* or *B*. An arbitrary number of REs can be separated by the
|
||||||
|
@ -829,7 +843,7 @@ form.
|
||||||
|
|
||||||
>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
|
>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
|
||||||
>>> print('[%s]+' % re.escape(legal_chars))
|
>>> print('[%s]+' % re.escape(legal_chars))
|
||||||
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+
|
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%\&'\*\+\-\.\^_`\|\~:]+
|
||||||
|
|
||||||
>>> operators = ['+', '-', '*', '/', '**']
|
>>> operators = ['+', '-', '*', '/', '**']
|
||||||
>>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
|
>>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
|
||||||
|
|
|
@ -300,7 +300,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
|
||||||
whatsnew/3.2,,:location,zope9-location = ${zope9:location}
|
whatsnew/3.2,,:location,zope9-location = ${zope9:location}
|
||||||
whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
|
whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
|
||||||
library/re,,`,!#$%&'*+-.^_`|~:
|
library/re,,`,!#$%&'*+-.^_`|~:
|
||||||
library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~:
|
library/re,,`,!\#\$%\&'\*\+\-\.\^_`\|\~:
|
||||||
library/tarfile,,:xz,'x:xz'
|
library/tarfile,,:xz,'x:xz'
|
||||||
library/xml.etree.elementtree,,:sometag,prefix:sometag
|
library/xml.etree.elementtree,,:sometag,prefix:sometag
|
||||||
library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""
|
library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""
|
||||||
|
|
|
|
@ -700,6 +700,17 @@ Changes in the Python API
|
||||||
argument ``os.scandir`` instead of ``os.listdir`` when listing the directory
|
argument ``os.scandir`` instead of ``os.listdir`` when listing the directory
|
||||||
is failed.
|
is failed.
|
||||||
|
|
||||||
|
* Support of nested sets and set operations in regular expressions as in
|
||||||
|
`Unicode Technical Standard #18`_ might be added in the future. This would
|
||||||
|
change the syntax, so to facilitate this change a :exc:`FutureWarning` will
|
||||||
|
be raised in ambiguous cases for the time being.
|
||||||
|
This includes sets starting with a literal ``'['`` or containing literal
|
||||||
|
character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To
|
||||||
|
avoid a warning, escape them with a backslash.
|
||||||
|
(Contributed by Serhiy Storchaka in :issue:`30349`.)
|
||||||
|
|
||||||
|
.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/
|
||||||
|
|
||||||
|
|
||||||
Changes in the C API
|
Changes in the C API
|
||||||
--------------------
|
--------------------
|
||||||
|
|
|
@ -1354,15 +1354,14 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
|
||||||
|
|
||||||
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
|
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
|
||||||
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
|
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
|
||||||
''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
|
re.escape(''.join(ATOM_ENDS)))).match
|
||||||
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
|
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
|
||||||
_non_token_end_matcher = re.compile(r"[^{}]+".format(
|
_non_token_end_matcher = re.compile(r"[^{}]+".format(
|
||||||
''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
|
re.escape(''.join(TOKEN_ENDS)))).match
|
||||||
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
|
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
|
||||||
''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
|
re.escape(''.join(ATTRIBUTE_ENDS)))).match
|
||||||
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
|
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
|
||||||
''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
|
re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match
|
||||||
'\\','\\\\').replace(']',r'\]'))).match
|
|
||||||
|
|
||||||
def _validate_xtext(xtext):
|
def _validate_xtext(xtext):
|
||||||
"""If input token contains ASCII non-printables, register a defect."""
|
"""If input token contains ASCII non-printables, register a defect."""
|
||||||
|
|
|
@ -251,8 +251,9 @@ def template(pattern, flags=0):
|
||||||
# SPECIAL_CHARS
|
# SPECIAL_CHARS
|
||||||
# closing ')', '}' and ']'
|
# closing ')', '}' and ']'
|
||||||
# '-' (a range in character set)
|
# '-' (a range in character set)
|
||||||
|
# '&', '~' (extended character set operations)
|
||||||
# '#' (comment) and WHITESPACE (ignored) in verbose mode
|
# '#' (comment) and WHITESPACE (ignored) in verbose mode
|
||||||
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'}
|
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'}
|
||||||
|
|
||||||
def escape(pattern):
|
def escape(pattern):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -517,6 +517,12 @@ def _parse(source, state, verbose, nested, first=False):
|
||||||
setappend = set.append
|
setappend = set.append
|
||||||
## if sourcematch(":"):
|
## if sourcematch(":"):
|
||||||
## pass # handle character classes
|
## pass # handle character classes
|
||||||
|
if source.next == '[':
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
'Possible nested set at position %d' % source.tell(),
|
||||||
|
FutureWarning, stacklevel=nested + 6
|
||||||
|
)
|
||||||
negate = sourcematch("^")
|
negate = sourcematch("^")
|
||||||
# check remaining characters
|
# check remaining characters
|
||||||
while True:
|
while True:
|
||||||
|
@ -529,6 +535,17 @@ def _parse(source, state, verbose, nested, first=False):
|
||||||
elif this[0] == "\\":
|
elif this[0] == "\\":
|
||||||
code1 = _class_escape(source, this)
|
code1 = _class_escape(source, this)
|
||||||
else:
|
else:
|
||||||
|
if set and this in '-&~|' and source.next == this:
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
'Possible set %s at position %d' % (
|
||||||
|
'difference' if this == '-' else
|
||||||
|
'intersection' if this == '&' else
|
||||||
|
'symmetric difference' if this == '~' else
|
||||||
|
'union',
|
||||||
|
source.tell() - 1),
|
||||||
|
FutureWarning, stacklevel=nested + 6
|
||||||
|
)
|
||||||
code1 = LITERAL, _ord(this)
|
code1 = LITERAL, _ord(this)
|
||||||
if sourcematch("-"):
|
if sourcematch("-"):
|
||||||
# potential range
|
# potential range
|
||||||
|
@ -545,6 +562,13 @@ def _parse(source, state, verbose, nested, first=False):
|
||||||
if that[0] == "\\":
|
if that[0] == "\\":
|
||||||
code2 = _class_escape(source, that)
|
code2 = _class_escape(source, that)
|
||||||
else:
|
else:
|
||||||
|
if that == '-':
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
'Possible set difference at position %d' % (
|
||||||
|
source.tell() - 2),
|
||||||
|
FutureWarning, stacklevel=nested + 6
|
||||||
|
)
|
||||||
code2 = LITERAL, _ord(that)
|
code2 = LITERAL, _ord(that)
|
||||||
if code1[0] != LITERAL or code2[0] != LITERAL:
|
if code1[0] != LITERAL or code2[0] != LITERAL:
|
||||||
msg = "bad character range %s-%s" % (this, that)
|
msg = "bad character range %s-%s" % (this, that)
|
||||||
|
|
|
@ -914,6 +914,51 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
|
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
|
||||||
self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
|
self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
|
||||||
|
|
||||||
|
def test_possible_set_operations(self):
|
||||||
|
s = bytes(range(128)).decode()
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[0-9--1]')
|
||||||
|
self.assertEqual(p.findall(s), list('-./0123456789'))
|
||||||
|
self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[%--1]')
|
||||||
|
self.assertEqual(p.findall(s), list("%&'()*+,-1"))
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[%--]')
|
||||||
|
self.assertEqual(p.findall(s), list("%&'()*+,-"))
|
||||||
|
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[0-9&&1]')
|
||||||
|
self.assertEqual(p.findall(s), list('&0123456789'))
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[\d&&1]')
|
||||||
|
self.assertEqual(p.findall(s), list('&0123456789'))
|
||||||
|
self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
|
||||||
|
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[0-9||a]')
|
||||||
|
self.assertEqual(p.findall(s), list('0123456789a|'))
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[\d||a]')
|
||||||
|
self.assertEqual(p.findall(s), list('0123456789a|'))
|
||||||
|
self.assertEqual(re.findall(r'[||1]', s), list('1|'))
|
||||||
|
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[0-9~~1]')
|
||||||
|
self.assertEqual(p.findall(s), list('0123456789~'))
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[\d~~1]')
|
||||||
|
self.assertEqual(p.findall(s), list('0123456789~'))
|
||||||
|
self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
|
||||||
|
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[[0-9]|]')
|
||||||
|
self.assertEqual(p.findall(s), list('0123456789[]'))
|
||||||
|
|
||||||
|
with self.assertWarns(FutureWarning):
|
||||||
|
p = re.compile(r'[[:digit:]|]')
|
||||||
|
self.assertEqual(p.findall(s), list(':[]dgit'))
|
||||||
|
|
||||||
def test_search_coverage(self):
|
def test_search_coverage(self):
|
||||||
self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
|
self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
|
||||||
self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
|
self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
|
||||||
|
@ -932,7 +977,7 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(m.group(), match)
|
self.assertEqual(m.group(), match)
|
||||||
self.assertEqual(m.span(), span)
|
self.assertEqual(m.span(), span)
|
||||||
|
|
||||||
LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~'
|
LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'
|
||||||
|
|
||||||
def test_re_escape(self):
|
def test_re_escape(self):
|
||||||
p = ''.join(chr(i) for i in range(256))
|
p = ''.join(chr(i) for i in range(256))
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
FutureWarning is now emitted if a regular expression contains character set
|
||||||
|
constructs that will change semantically in the future (nested sets and set
|
||||||
|
operations).
|
Loading…
Reference in New Issue