From b22273ec5d1992b0cbe078b887427ae9977dfb78 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Fri, 13 Oct 2017 16:02:23 +0900 Subject: [PATCH] bpo-31672: Fix string.Template accidentally matching non-ASCII identifiers (GH-3872) The pattern `[a-z]` with the `IGNORECASE` flag can match some non-ASCII characters. The straightforward solution for this is to use the `IGNORECASE | ASCII` flags. But users may subclass `Template` and override only `idpattern`, so we want to avoid changing `Template.flags`. Instead, this commit uses the local flag `-i` for `idpattern` and changes `[a-z]` to `[a-zA-Z]`. --- Doc/library/string.rst | 13 +++++++++++-- Lib/string.py | 6 +++++- Lib/test/test_string.py | 6 ++++++ .../2017-10-12-02-47-16.bpo-31672.DaOkVd.rst | 2 ++ 4 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst diff --git a/Doc/library/string.rst b/Doc/library/string.rst index 1a9b6309752..1076cdb2346 100644 --- a/Doc/library/string.rst +++ b/Doc/library/string.rst @@ -755,8 +755,17 @@ attributes: * *idpattern* -- This is the regular expression describing the pattern for non-braced placeholders. The default value is the regular expression - ``[_a-z][_a-z0-9]*``. If this is given and *braceidpattern* is ``None`` - this pattern will also apply to braced placeholders. + ``(?-i:[_a-zA-Z][_a-zA-Z0-9]*)``. If this is given and *braceidpattern* is + ``None`` this pattern will also apply to braced placeholders. + + .. note:: + + Since the default *flags* is ``re.IGNORECASE``, the pattern ``[a-z]`` can match + some non-ASCII characters. That's why we use the local ``-i`` flag here. + + While *flags* is kept as ``re.IGNORECASE`` for backward compatibility, + you can override it to ``0`` or ``re.IGNORECASE | re.ASCII`` when + subclassing. This is a simple way to avoid unexpected matches like the one described above. .. 
versionchanged:: 3.7 *braceidpattern* can be used to define separate patterns used inside and diff --git a/Lib/string.py b/Lib/string.py index b46e60c38f4..a3e6d91bb4a 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -79,7 +79,11 @@ class Template(metaclass=_TemplateMetaclass): """A string class for supporting $-substitutions.""" delimiter = '$' - idpattern = r'[_a-z][_a-z0-9]*' + # r'[a-z]' matches non-ASCII letters when used with IGNORECASE, + # but without the ASCII flag. We can't add re.ASCII to flags because of + # backward compatibility. So we use the local -i flag and the [a-zA-Z] pattern. + # See https://bugs.python.org/issue31672 + idpattern = r'(?-i:[_a-zA-Z][_a-zA-Z0-9]*)' braceidpattern = None flags = _re.IGNORECASE diff --git a/Lib/test/test_string.py b/Lib/test/test_string.py index 6e241ac72ab..3480459c282 100644 --- a/Lib/test/test_string.py +++ b/Lib/test/test_string.py @@ -270,6 +270,12 @@ class TestTemplate(unittest.TestCase): raises(ValueError, s.substitute, dict(who='tim')) s = Template('$who likes $100') raises(ValueError, s.substitute, dict(who='tim')) + # Template.idpattern should match only ASCII characters. + # https://bugs.python.org/issue31672 + s = Template("$who likes $\u0131") # (DOTLESS I) + raises(ValueError, s.substitute, dict(who='tim')) + s = Template("$who likes $\u0130") # (LATIN CAPITAL LETTER I WITH DOT ABOVE) + raises(ValueError, s.substitute, dict(who='tim')) def test_idpattern_override(self): class PathPattern(Template): diff --git a/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst b/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst new file mode 100644 index 00000000000..b8de1f3b1db --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst @@ -0,0 +1,2 @@ +``idpattern`` in ``string.Template`` matched some non-ASCII characters. Now +it uses the ``-i`` regular expression local flag to avoid matching non-ASCII characters.