diff --git a/Doc/howto/regex.rst b/Doc/howto/regex.rst index 9ae04d718d8..ad2c6ab7d54 100644 --- a/Doc/howto/regex.rst +++ b/Doc/howto/regex.rst @@ -1138,7 +1138,7 @@ Empty matches are replaced only when they're not adjacent to a previous match. If *replacement* is a string, any backslash escapes in it are processed. That is, ``\n`` is converted to a single newline character, ``\r`` is converted to a -carriage return, and so forth. Unknown escapes such as ``\j`` are left alone. +carriage return, and so forth. Unknown escapes such as ``\&`` are left alone. Backreferences, such as ``\6``, are replaced with the substring matched by the corresponding group in the RE. This lets you incorporate portions of the original text in the resulting replacement string. diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 0d305d58c92..888458449a1 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -438,6 +438,10 @@ three digits in length. .. versionchanged:: 3.3 The ``'\u'`` and ``'\U'`` escape sequences have been added. +.. deprecated-removed:: 3.5 3.6 + Unknown escapes consisting of ``'\'`` and an ASCII letter now raise a + deprecation warning and will be forbidden in Python 3.6. + .. seealso:: @@ -687,7 +691,7 @@ form. *string* is returned unchanged. *repl* can be a string or a function; if it is a string, any backslash escapes in it are processed. That is, ``\n`` is converted to a single newline character, ``\r`` is converted to a carriage return, and - so forth. Unknown escapes such as ``\j`` are left alone. Backreferences, such + so forth. Unknown escapes such as ``\&`` are left alone. Backreferences, such as ``\6``, are replaced with the substring matched by group 6 in the pattern. For example: @@ -732,6 +736,10 @@ form. .. versionchanged:: 3.5 Unmatched groups are replaced with an empty string. + .. deprecated-removed:: 3.5 3.6 + Unknown escapes consisting of ``'\'`` and an ASCII letter now raise a + deprecation warning and will be forbidden in Python 3.6. + .. 
function:: subn(pattern, repl, string, count=0, flags=0) diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 98afd7cb4a6..af729c30db1 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -21,6 +21,7 @@ DIGITS = frozenset("0123456789") OCTDIGITS = frozenset("01234567") HEXDIGITS = frozenset("0123456789abcdefABCDEF") +ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") WHITESPACE = frozenset(" \t\n\r\v\f") @@ -344,6 +345,10 @@ def _class_escape(source, escape): elif c in DIGITS: raise ValueError if len(escape) == 2: + if c in ASCIILETTERS: + import warnings + warnings.warn('bad escape %s' % escape, + DeprecationWarning, stacklevel=8) return LITERAL, ord(escape[1]) except ValueError: pass @@ -407,6 +412,10 @@ def _escape(source, escape, state): return GROUPREF, group raise ValueError if len(escape) == 2: + if c in ASCIILETTERS: + import warnings + warnings.warn('bad escape %s' % escape, + DeprecationWarning, stacklevel=8) return LITERAL, ord(escape[1]) except ValueError: pass @@ -903,7 +912,10 @@ def parse_template(source, pattern): try: this = chr(ESCAPES[this][1]) except KeyError: - pass + if c in ASCIILETTERS: + import warnings + warnings.warn('bad escape %s' % this, + DeprecationWarning, stacklevel=5) lappend(this) else: lappend(this) diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py index 7f8075ec148..8c158f883bc 100755 --- a/Lib/test/re_tests.py +++ b/Lib/test/re_tests.py @@ -87,7 +87,7 @@ tests = [ (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), # NOTE: not an error under PCRE/PRE: (r'\u', '', SYNTAX_ERROR), # A Perl escape - (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'), + # (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'), (r'\xff', '\377', SUCCEED, 'found', chr(255)), # new \x semantics (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)), @@ -607,8 +607,8 @@ xyzabc # new \x semantics (r'\x00ff', 
'\377', FAIL), # (r'\x00ff', '\377', SUCCEED, 'found', chr(255)), - (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), - ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), + (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', '\t\n\v\r\f\a'), + ('\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', '\t\n\v\r\f\a'), (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)), (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'), diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 0fbf8c5a237..9af1e8aa2d2 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -100,11 +100,14 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub('(?Px)', '\g\g', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', '\g<1>\g<1>', 'xx'), 'xxxx') - self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), - '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), - (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) + self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), + (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) + for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': + with self.subTest(c): + with self.assertWarns(DeprecationWarning): + self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') @@ -551,14 +554,23 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match(r"\(", '(').group(), '(') self.assertIsNone(re.match(r"\(", ')')) self.assertEqual(re.match(r"\\", '\\').group(), '\\') - self.assertEqual(re.match(r"\y", 'y').group(), 'y') - self.assertIsNone(re.match(r"\y", 'z')) self.assertEqual(re.match(r"[\]]", ']').group(), 
']') self.assertIsNone(re.match(r"[\]]", '[')) self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') self.assertIsNone(re.match(r"[a\-c]", 'b')) self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') self.assertIsNone(re.match(r"[\^a]+", 'b')) + re.purge() # for warnings + for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY': + with self.subTest(c): + with self.assertWarns(DeprecationWarning): + self.assertEqual(re.fullmatch('\\%c' % c, c).group(), c) + self.assertIsNone(re.match('\\%c' % c, 'a')) + for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ': + with self.subTest(c): + with self.assertWarns(DeprecationWarning): + self.assertEqual(re.fullmatch('[\\%c]' % c, c).group(), c) + self.assertIsNone(re.match('[\\%c]' % c, 'a')) def test_string_boundaries(self): # See http://bugs.python.org/issue10713 @@ -907,8 +919,10 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0")) self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z")) - self.assertTrue(re.match(br"\u", b'u')) - self.assertTrue(re.match(br"\U", b'U')) + with self.assertWarns(DeprecationWarning): + self.assertTrue(re.match(br"\u1234", b'u1234')) + with self.assertWarns(DeprecationWarning): + self.assertTrue(re.match(br"\U00012345", b'U00012345')) self.assertTrue(re.match(br"\0", b"\000")) self.assertTrue(re.match(br"\08", b"\0008")) self.assertTrue(re.match(br"\01", b"\001")) @@ -928,8 +942,10 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i]))) - self.assertTrue(re.match(br"[\u]", b'u')) - self.assertTrue(re.match(br"[\U]", b'U')) + with self.assertWarns(DeprecationWarning): + self.assertTrue(re.match(br"[\u1234]", b'u')) + with self.assertWarns(DeprecationWarning): + 
self.assertTrue(re.match(br"[\U00012345]", b'U')) self.assertRaises(re.error, re.match, br"[\567]", b"") self.assertRaises(re.error, re.match, br"[\911]", b"") self.assertRaises(re.error, re.match, br"[\x1z]", b"") @@ -1304,8 +1320,9 @@ class ReTests(unittest.TestCase): def test_bug_13899(self): # Issue #13899: re pattern r"[\A]" should work like "A" but matches # nothing. Ditto B and Z. - self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), - ['A', 'B', '\b', 'C', 'Z']) + with self.assertWarns(DeprecationWarning): + self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), + ['A', 'B', '\b', 'C', 'Z']) @bigmemtest(size=_2G, memuse=1) def test_large_search(self, size): diff --git a/Misc/NEWS b/Misc/NEWS index c7e80d5d79a..c1489983a3c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -30,6 +30,10 @@ Core and Builtins Library ------- +- Issue #23622: Unknown escapes in regular expressions that consist of ``'\'`` + and an ASCII letter now raise a deprecation warning and will be forbidden in + Python 3.6. + - Issue #23671: string.Template now allows to specify the "self" parameter as keyword argument. string.Formatter now allows to specify the "self" and the "format_string" parameters as keyword arguments.