gh-91760: More strict rules for numerical group references and group names in RE (GH-91792)

Only a sequence of ASCII digits is now accepted as a numerical reference. Group names in bytes patterns and replacement strings can now only contain ASCII letters, digits, and underscores.
2022-05-08 19:19:29 +03:00 · 2022-05-08 19:19:29 +03:00 · a84a56d80f
parent 7b024e3a3f
commit a84a56d80f
5 changed files with 62 additions and 91 deletions
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@ -395,7 +395,8 @@ The special characters are:
 ``(?P<name>...)``
   Similar to regular parentheses, but the substring matched by the group is
   accessible via the symbolic group name *name*.  Group names must be valid
-   Python identifiers, and each group name must be defined only once within a
+   Python identifiers, and in bytes patterns they must contain only characters
+   in the ASCII range.  Each group name must be defined only once within a
   regular expression.  A symbolic group is also a numbered group, just as if
   the group were not named.

@ -417,8 +418,9 @@ The special characters are:
   |                                       | * ``\1``                         |
   +---------------------------------------+----------------------------------+

-   .. deprecated:: 3.11
-      Group names containing non-ASCII characters in bytes patterns.
+   .. versionchanged:: 3.12
+      In bytes patterns group names must contain only characters in
+      the ASCII range.

 .. index:: single: (?P=; in regular expressions

@ -489,8 +491,8 @@ The special characters are:
   will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
   not with ``'<user@host.com'`` nor ``'user@host.com>'``.

-   .. deprecated:: 3.11
-      Group *id* containing anything except ASCII digits.
+   .. versionchanged:: 3.12
+      Group *id* can only contain ASCII digits.


 The special sequences consist of ``'\'`` and a character from the list below.
@ -1001,9 +1003,10 @@ form.
      Empty matches for the pattern are replaced when adjacent to a previous
      non-empty match.

-   .. deprecated:: 3.11
-      Group *id* containing anything except ASCII digits.
-      Group names containing non-ASCII characters in bytes replacement strings.
+   .. versionchanged:: 3.12
+      Group *id* can only contain ASCII digits.
+      In bytes replacement strings group names must contain only characters
+      in the ASCII range.


 .. function:: subn(pattern, repl, string, count=0, flags=0)
--- a/Doc/whatsnew/3.12.rst
+++ b/Doc/whatsnew/3.12.rst
@ -114,3 +114,13 @@ Porting to Python 3.12

 This section lists previously described changes and other bugfixes
 that may require changes to your code.
+
+Changes in the Python API
+-------------------------
+
+* Stricter rules are now applied for numerical group references and
+  group names in regular expressions.
+  Only a sequence of ASCII digits is now accepted as a numerical reference.
+  Group names in bytes patterns and replacement strings can now only
+  contain ASCII letters, digits, and underscores.
+  (Contributed by Serhiy Storchaka in :gh:`91760`.)
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@ -291,17 +291,13 @@ class Tokenizer:
            msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
        return error(msg, self.string, self.tell() - offset)

-    def checkgroupname(self, name, offset, nested):
+    def checkgroupname(self, name, offset):
+        if not (self.istext or name.isascii()):
+            msg = "bad character in group name %a" % name
+            raise self.error(msg, len(name) + offset)
        if not name.isidentifier():
            msg = "bad character in group name %r" % name
            raise self.error(msg, len(name) + offset)
-        if not (self.istext or name.isascii()):
-            import warnings
-            warnings.warn(
-                "bad character in group name %a at position %d" %
-                (name, self.tell() - len(name) - offset),
-                DeprecationWarning, stacklevel=nested + 7
-            )

 def _class_escape(source, escape):
    # handle escape code inside character class
@ -717,11 +713,11 @@ def _parse(source, state, verbose, nested, first=False):
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = source.getuntil(">", "group name")
-                        source.checkgroupname(name, 1, nested)
+                        source.checkgroupname(name, 1)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")", "group name")
-                        source.checkgroupname(name, 1, nested)
+                        source.checkgroupname(name, 1)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name %r" % name
@ -782,20 +778,14 @@ def _parse(source, state, verbose, nested, first=False):
                elif char == "(":
                    # conditional backreference group
                    condname = source.getuntil(")", "group name")
-                    if condname.isidentifier():
-                        source.checkgroupname(condname, 1, nested)
+                    if not (condname.isdecimal() and condname.isascii()):
+                        source.checkgroupname(condname, 1)
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name %r" % condname
                            raise source.error(msg, len(condname) + 1)
                    else:
-                        try:
-                            condgroup = int(condname)
-                            if condgroup < 0:
-                                raise ValueError
-                        except ValueError:
-                            msg = "bad character in group name %r" % condname
-                            raise source.error(msg, len(condname) + 1) from None
+                        condgroup = int(condname)
                        if not condgroup:
                            raise source.error("bad group number",
                                               len(condname) + 1)
@ -1022,20 +1012,14 @@ def parse_template(source, state):
                if not s.match("<"):
                    raise s.error("missing <")
                name = s.getuntil(">", "group name")
-                if name.isidentifier():
-                    s.checkgroupname(name, 1, -1)
+                if not (name.isdecimal() and name.isascii()):
+                    s.checkgroupname(name, 1)
                    try:
                        index = groupindex[name]
                    except KeyError:
                        raise IndexError("unknown group name %r" % name) from None
                else:
-                    try:
-                        index = int(name)
-                        if index < 0:
-                            raise ValueError
-                    except ValueError:
-                        raise s.error("bad character in group name %r" % name,
-                                      len(name) + 1) from None
+                    index = int(name)
                    if index >= MAXGROUPS:
                        raise s.error("invalid group reference %d" % index,
                                      len(name) + 1)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@ -275,21 +275,12 @@ class ReTests(unittest.TestCase):
        self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
        self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
        self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '\\xc2\\xb5' "
-                                   r"at position 4") as w:
-            re.compile(b'(?P<\xc2\xb5>x)')
-        self.assertEqual(w.filename, __file__)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '\\xc2\\xb5' "
-                                   r"at position 4"):
-            self.checkPatternError(b'(?P=\xc2\xb5)',
-                                   r"unknown group name '\xc2\xb5'", 4)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '\\xc2\\xb5' "
-                                   r"at position 3"):
-            self.checkPatternError(b'(?(\xc2\xb5)y)',
-                                   r"unknown group name '\xc2\xb5'", 3)
+        self.checkPatternError(b'(?P<\xc2\xb5>x)',
+                               r"bad character in group name '\xc2\xb5'", 4)
+        self.checkPatternError(b'(?P=\xc2\xb5)',
+                               r"bad character in group name '\xc2\xb5'", 4)
+        self.checkPatternError(b'(?(\xc2\xb5)y)',
+                               r"bad character in group name '\xc2\xb5'", 3)

    def test_symbolic_refs(self):
        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@ -322,35 +313,22 @@ class ReTests(unittest.TestCase):
            re.sub('(?P<a>x)', r'\g<ab>', 'xx')
        self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                                "bad character in group name '-1'", 3)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '\+1' "
-                                   r"at position 3") as w:
-            re.sub('(?P<a>x)', r'\g<+1>', 'xx')
-        self.assertEqual(w.filename, __file__)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '1_0' "
-                                   r"at position 3"):
-            re.sub('()'*10, r'\g<1_0>', 'xx')
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name ' 1 ' "
-                                   r"at position 3"):
-            re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
+        self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
+                                "bad character in group name '+1'", 3)
+        self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
+                                "bad character in group name '1_0'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
+                                "bad character in group name ' 1 '", 3)
        self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                                "bad character in group name '©'", 3)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '\\xc2\\xb5' "
-                                   r"at position 3") as w:
-            with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
-                re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
-        self.assertEqual(w.filename, __file__)
+        self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
+                                r"bad character in group name '\xc2\xb5'", 3)
        self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
                                "bad character in group name '㊀'", 3)
        self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
                                "bad character in group name '¹'", 3)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '१' "
-                                   r"at position 3"):
-            re.sub('(?P<a>x)', r'\g<१>', 'xx')
+        self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
+                                "bad character in group name '१'", 3)

    def test_re_subn(self):
        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@ -616,27 +594,18 @@ class ReTests(unittest.TestCase):
        self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
        self.checkPatternError(r'()(?(-1)a|b)',
                               "bad character in group name '-1'", 5)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '\+1' "
-                                   r"at position 5") as w:
-            re.compile(r'()(?(+1)a|b)')
-        self.assertEqual(w.filename, __file__)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '1_0' "
-                                   r"at position 23"):
-            re.compile(r'()'*10 + r'(?(1_0)a|b)')
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name ' 1 ' "
-                                   r"at position 5"):
-            re.compile(r'()(?( 1 )a|b)')
+        self.checkPatternError(r'()(?(+1)a|b)',
+                               "bad character in group name '+1'", 5)
+        self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
+                               "bad character in group name '1_0'", 23)
+        self.checkPatternError(r'()(?( 1 )a|b)',
+                               "bad character in group name ' 1 '", 5)
        self.checkPatternError(r'()(?(㊀)a|b)',
                               "bad character in group name '㊀'", 5)
        self.checkPatternError(r'()(?(¹)a|b)',
                               "bad character in group name '¹'", 5)
-        with self.assertWarnsRegex(DeprecationWarning,
-                                   r"bad character in group name '१' "
-                                   r"at position 5"):
-            re.compile(r'()(?(१)a|b)')
+        self.checkPatternError(r'()(?(१)a|b)',
+                               "bad character in group name '१'", 5)
        self.checkPatternError(r'()(?(1',
                               "missing ), unterminated name", 5)
        self.checkPatternError(r'()(?(1)a',
--- a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
+++ b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
@ -0,0 +1,5 @@
+Apply stricter rules for numerical group references and group names in
+regular expressions. Only a sequence of ASCII digits is now accepted as
+a numerical reference. Group names in
+bytes patterns and replacement strings can now only contain ASCII letters,
+digits, and underscores.