mirror of https://github.com/python/cpython
gh-91760: More strict rules for numerical group references and group names in RE (GH-91792)
Only a sequence of ASCII digits is now accepted as a numerical reference. Group names in bytes patterns and replacement strings can now contain only ASCII letters, digits, and underscores.
This commit is contained in:
parent
7b024e3a3f
commit
a84a56d80f
|
@ -395,7 +395,8 @@ The special characters are:
|
|||
``(?P<name>...)``
|
||||
Similar to regular parentheses, but the substring matched by the group is
|
||||
accessible via the symbolic group name *name*. Group names must be valid
|
||||
Python identifiers, and each group name must be defined only once within a
|
||||
Python identifiers, and in bytes patterns they must contain only characters
|
||||
in the ASCII range. Each group name must be defined only once within a
|
||||
regular expression. A symbolic group is also a numbered group, just as if
|
||||
the group were not named.
|
||||
|
||||
|
@ -417,8 +418,9 @@ The special characters are:
|
|||
| | * ``\1`` |
|
||||
+---------------------------------------+----------------------------------+
|
||||
|
||||
.. deprecated:: 3.11
|
||||
Group names containing non-ASCII characters in bytes patterns.
|
||||
.. versionchanged:: 3.12
|
||||
In bytes patterns group names must contain only characters in
|
||||
the ASCII range.
|
||||
|
||||
.. index:: single: (?P=; in regular expressions
|
||||
|
||||
|
@ -489,8 +491,8 @@ The special characters are:
|
|||
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
|
||||
not with ``'<user@host.com'`` nor ``'user@host.com>'``.
|
||||
|
||||
.. deprecated:: 3.11
|
||||
Group *id* containing anything except ASCII digits.
|
||||
.. versionchanged:: 3.12
|
||||
Group *id* can only contain ASCII digits.
|
||||
|
||||
|
||||
The special sequences consist of ``'\'`` and a character from the list below.
|
||||
|
@ -1001,9 +1003,10 @@ form.
|
|||
Empty matches for the pattern are replaced when adjacent to a previous
|
||||
non-empty match.
|
||||
|
||||
.. deprecated:: 3.11
|
||||
Group *id* containing anything except ASCII digits.
|
||||
Group names containing non-ASCII characters in bytes replacement strings.
|
||||
.. versionchanged:: 3.12
|
||||
Group *id* can only contain ASCII digits.
|
||||
In bytes replacement strings group names must contain only characters
|
||||
in the ASCII range.
|
||||
|
||||
|
||||
.. function:: subn(pattern, repl, string, count=0, flags=0)
|
||||
|
|
|
@ -114,3 +114,13 @@ Porting to Python 3.12
|
|||
|
||||
This section lists previously described changes and other bugfixes
|
||||
that may require changes to your code.
|
||||
|
||||
Changes in the Python API
|
||||
-------------------------
|
||||
|
||||
* Stricter rules are now applied for numerical group references and
|
||||
group names in regular expressions.
|
||||
Only a sequence of ASCII digits is now accepted as a numerical reference.
|
||||
The group name in bytes patterns and replacement strings can now contain
|
||||
only ASCII letters, digits, and underscores.
|
||||
(Contributed by Serhiy Storchaka in :gh:`91760`.)
|
||||
|
|
|
@ -291,17 +291,13 @@ class Tokenizer:
|
|||
msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
|
||||
return error(msg, self.string, self.tell() - offset)
|
||||
|
||||
def checkgroupname(self, name, offset, nested):
|
||||
def checkgroupname(self, name, offset):
|
||||
if not (self.istext or name.isascii()):
|
||||
msg = "bad character in group name %a" % name
|
||||
raise self.error(msg, len(name) + offset)
|
||||
if not name.isidentifier():
|
||||
msg = "bad character in group name %r" % name
|
||||
raise self.error(msg, len(name) + offset)
|
||||
if not (self.istext or name.isascii()):
|
||||
import warnings
|
||||
warnings.warn(
|
||||
"bad character in group name %a at position %d" %
|
||||
(name, self.tell() - len(name) - offset),
|
||||
DeprecationWarning, stacklevel=nested + 7
|
||||
)
|
||||
|
||||
def _class_escape(source, escape):
|
||||
# handle escape code inside character class
|
||||
|
@ -717,11 +713,11 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
if sourcematch("<"):
|
||||
# named group: skip forward to end of name
|
||||
name = source.getuntil(">", "group name")
|
||||
source.checkgroupname(name, 1, nested)
|
||||
source.checkgroupname(name, 1)
|
||||
elif sourcematch("="):
|
||||
# named backreference
|
||||
name = source.getuntil(")", "group name")
|
||||
source.checkgroupname(name, 1, nested)
|
||||
source.checkgroupname(name, 1)
|
||||
gid = state.groupdict.get(name)
|
||||
if gid is None:
|
||||
msg = "unknown group name %r" % name
|
||||
|
@ -782,20 +778,14 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
elif char == "(":
|
||||
# conditional backreference group
|
||||
condname = source.getuntil(")", "group name")
|
||||
if condname.isidentifier():
|
||||
source.checkgroupname(condname, 1, nested)
|
||||
if not (condname.isdecimal() and condname.isascii()):
|
||||
source.checkgroupname(condname, 1)
|
||||
condgroup = state.groupdict.get(condname)
|
||||
if condgroup is None:
|
||||
msg = "unknown group name %r" % condname
|
||||
raise source.error(msg, len(condname) + 1)
|
||||
else:
|
||||
try:
|
||||
condgroup = int(condname)
|
||||
if condgroup < 0:
|
||||
raise ValueError
|
||||
except ValueError:
|
||||
msg = "bad character in group name %r" % condname
|
||||
raise source.error(msg, len(condname) + 1) from None
|
||||
condgroup = int(condname)
|
||||
if not condgroup:
|
||||
raise source.error("bad group number",
|
||||
len(condname) + 1)
|
||||
|
@ -1022,20 +1012,14 @@ def parse_template(source, state):
|
|||
if not s.match("<"):
|
||||
raise s.error("missing <")
|
||||
name = s.getuntil(">", "group name")
|
||||
if name.isidentifier():
|
||||
s.checkgroupname(name, 1, -1)
|
||||
if not (name.isdecimal() and name.isascii()):
|
||||
s.checkgroupname(name, 1)
|
||||
try:
|
||||
index = groupindex[name]
|
||||
except KeyError:
|
||||
raise IndexError("unknown group name %r" % name) from None
|
||||
else:
|
||||
try:
|
||||
index = int(name)
|
||||
if index < 0:
|
||||
raise ValueError
|
||||
except ValueError:
|
||||
raise s.error("bad character in group name %r" % name,
|
||||
len(name) + 1) from None
|
||||
index = int(name)
|
||||
if index >= MAXGROUPS:
|
||||
raise s.error("invalid group reference %d" % index,
|
||||
len(name) + 1)
|
||||
|
|
|
@ -275,21 +275,12 @@ class ReTests(unittest.TestCase):
|
|||
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
|
||||
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
|
||||
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '\\xc2\\xb5' "
|
||||
r"at position 4") as w:
|
||||
re.compile(b'(?P<\xc2\xb5>x)')
|
||||
self.assertEqual(w.filename, __file__)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '\\xc2\\xb5' "
|
||||
r"at position 4"):
|
||||
self.checkPatternError(b'(?P=\xc2\xb5)',
|
||||
r"unknown group name '\xc2\xb5'", 4)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '\\xc2\\xb5' "
|
||||
r"at position 3"):
|
||||
self.checkPatternError(b'(?(\xc2\xb5)y)',
|
||||
r"unknown group name '\xc2\xb5'", 3)
|
||||
self.checkPatternError(b'(?P<\xc2\xb5>x)',
|
||||
r"bad character in group name '\xc2\xb5'", 4)
|
||||
self.checkPatternError(b'(?P=\xc2\xb5)',
|
||||
r"bad character in group name '\xc2\xb5'", 4)
|
||||
self.checkPatternError(b'(?(\xc2\xb5)y)',
|
||||
r"bad character in group name '\xc2\xb5'", 3)
|
||||
|
||||
def test_symbolic_refs(self):
|
||||
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
|
||||
|
@ -322,35 +313,22 @@ class ReTests(unittest.TestCase):
|
|||
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
|
||||
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
|
||||
"bad character in group name '-1'", 3)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '\+1' "
|
||||
r"at position 3") as w:
|
||||
re.sub('(?P<a>x)', r'\g<+1>', 'xx')
|
||||
self.assertEqual(w.filename, __file__)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '1_0' "
|
||||
r"at position 3"):
|
||||
re.sub('()'*10, r'\g<1_0>', 'xx')
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name ' 1 ' "
|
||||
r"at position 3"):
|
||||
re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
|
||||
self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
|
||||
"bad character in group name '+1'", 3)
|
||||
self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
|
||||
"bad character in group name '1_0'", 3)
|
||||
self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
|
||||
"bad character in group name ' 1 '", 3)
|
||||
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
|
||||
"bad character in group name '©'", 3)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '\\xc2\\xb5' "
|
||||
r"at position 3") as w:
|
||||
with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
|
||||
re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
|
||||
self.assertEqual(w.filename, __file__)
|
||||
self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
|
||||
r"bad character in group name '\xc2\xb5'", 3)
|
||||
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
|
||||
"bad character in group name '㊀'", 3)
|
||||
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
|
||||
"bad character in group name '¹'", 3)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '१' "
|
||||
r"at position 3"):
|
||||
re.sub('(?P<a>x)', r'\g<१>', 'xx')
|
||||
self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
|
||||
"bad character in group name '१'", 3)
|
||||
|
||||
def test_re_subn(self):
|
||||
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
|
||||
|
@ -616,27 +594,18 @@ class ReTests(unittest.TestCase):
|
|||
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
|
||||
self.checkPatternError(r'()(?(-1)a|b)',
|
||||
"bad character in group name '-1'", 5)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '\+1' "
|
||||
r"at position 5") as w:
|
||||
re.compile(r'()(?(+1)a|b)')
|
||||
self.assertEqual(w.filename, __file__)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '1_0' "
|
||||
r"at position 23"):
|
||||
re.compile(r'()'*10 + r'(?(1_0)a|b)')
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name ' 1 ' "
|
||||
r"at position 5"):
|
||||
re.compile(r'()(?( 1 )a|b)')
|
||||
self.checkPatternError(r'()(?(+1)a|b)',
|
||||
"bad character in group name '+1'", 5)
|
||||
self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
|
||||
"bad character in group name '1_0'", 23)
|
||||
self.checkPatternError(r'()(?( 1 )a|b)',
|
||||
"bad character in group name ' 1 '", 5)
|
||||
self.checkPatternError(r'()(?(㊀)a|b)',
|
||||
"bad character in group name '㊀'", 5)
|
||||
self.checkPatternError(r'()(?(¹)a|b)',
|
||||
"bad character in group name '¹'", 5)
|
||||
with self.assertWarnsRegex(DeprecationWarning,
|
||||
r"bad character in group name '१' "
|
||||
r"at position 5"):
|
||||
re.compile(r'()(?(१)a|b)')
|
||||
self.checkPatternError(r'()(?(१)a|b)',
|
||||
"bad character in group name '१'", 5)
|
||||
self.checkPatternError(r'()(?(1',
|
||||
"missing ), unterminated name", 5)
|
||||
self.checkPatternError(r'()(?(1)a',
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
Apply stricter rules for numerical group references and group names in
|
||||
regular expressions. Only a sequence of ASCII digits is now accepted as
|
||||
a numerical reference. The group name in
|
||||
bytes patterns and replacement strings can now contain only ASCII letters,
|
||||
digits, and underscores.
|
Loading…
Reference in New Issue