Issue #23622: Unknown escapes in regular expressions that consist of ``'\'``

and an ASCII letter now raise a deprecation warning and will be forbidden in
Python 3.6.
This commit is contained in:
Serhiy Storchaka 2015-03-24 22:58:14 +02:00
parent 793c14ea29
commit a54aae0683
6 changed files with 60 additions and 19 deletions

View File

@ -1138,7 +1138,7 @@ Empty matches are replaced only when they're not adjacent to a previous match.
If *replacement* is a string, any backslash escapes in it are processed. That If *replacement* is a string, any backslash escapes in it are processed. That
is, ``\n`` is converted to a single newline character, ``\r`` is converted to a is, ``\n`` is converted to a single newline character, ``\r`` is converted to a
carriage return, and so forth. Unknown escapes such as ``\j`` are left alone. carriage return, and so forth. Unknown escapes such as ``\&`` are left alone.
Backreferences, such as ``\6``, are replaced with the substring matched by the Backreferences, such as ``\6``, are replaced with the substring matched by the
corresponding group in the RE. This lets you incorporate portions of the corresponding group in the RE. This lets you incorporate portions of the
original text in the resulting replacement string. original text in the resulting replacement string.

View File

@ -438,6 +438,10 @@ three digits in length.
.. versionchanged:: 3.3 .. versionchanged:: 3.3
The ``'\u'`` and ``'\U'`` escape sequences have been added. The ``'\u'`` and ``'\U'`` escape sequences have been added.
.. deprecated-removed:: 3.5 3.6
Unknown escapes consisting of ``'\'`` and an ASCII letter now raise a
deprecation warning and will be forbidden in Python 3.6.
.. seealso:: .. seealso::
@ -687,7 +691,7 @@ form.
*string* is returned unchanged. *repl* can be a string or a function; if it is *string* is returned unchanged. *repl* can be a string or a function; if it is
a string, any backslash escapes in it are processed. That is, ``\n`` is a string, any backslash escapes in it are processed. That is, ``\n`` is
converted to a single newline character, ``\r`` is converted to a carriage return, and converted to a single newline character, ``\r`` is converted to a carriage return, and
so forth. Unknown escapes such as ``\j`` are left alone. Backreferences, such so forth. Unknown escapes such as ``\&`` are left alone. Backreferences, such
as ``\6``, are replaced with the substring matched by group 6 in the pattern. as ``\6``, are replaced with the substring matched by group 6 in the pattern.
For example: For example:
@ -732,6 +736,10 @@ form.
.. versionchanged:: 3.5 .. versionchanged:: 3.5
Unmatched groups are replaced with an empty string. Unmatched groups are replaced with an empty string.
.. deprecated-removed:: 3.5 3.6
Unknown escapes consisting of ``'\'`` and an ASCII letter now raise a
deprecation warning and will be forbidden in Python 3.6.
.. function:: subn(pattern, repl, string, count=0, flags=0) .. function:: subn(pattern, repl, string, count=0, flags=0)

View File

@ -21,6 +21,7 @@ DIGITS = frozenset("0123456789")
OCTDIGITS = frozenset("01234567") OCTDIGITS = frozenset("01234567")
HEXDIGITS = frozenset("0123456789abcdefABCDEF") HEXDIGITS = frozenset("0123456789abcdefABCDEF")
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
WHITESPACE = frozenset(" \t\n\r\v\f") WHITESPACE = frozenset(" \t\n\r\v\f")
@ -344,6 +345,10 @@ def _class_escape(source, escape):
elif c in DIGITS: elif c in DIGITS:
raise ValueError raise ValueError
if len(escape) == 2: if len(escape) == 2:
if c in ASCIILETTERS:
import warnings
warnings.warn('bad escape %s' % escape,
DeprecationWarning, stacklevel=8)
return LITERAL, ord(escape[1]) return LITERAL, ord(escape[1])
except ValueError: except ValueError:
pass pass
@ -407,6 +412,10 @@ def _escape(source, escape, state):
return GROUPREF, group return GROUPREF, group
raise ValueError raise ValueError
if len(escape) == 2: if len(escape) == 2:
if c in ASCIILETTERS:
import warnings
warnings.warn('bad escape %s' % escape,
DeprecationWarning, stacklevel=8)
return LITERAL, ord(escape[1]) return LITERAL, ord(escape[1])
except ValueError: except ValueError:
pass pass
@ -903,7 +912,10 @@ def parse_template(source, pattern):
try: try:
this = chr(ESCAPES[this][1]) this = chr(ESCAPES[this][1])
except KeyError: except KeyError:
pass if c in ASCIILETTERS:
import warnings
warnings.warn('bad escape %s' % this,
DeprecationWarning, stacklevel=5)
lappend(this) lappend(this)
else: else:
lappend(this) lappend(this)

View File

@ -87,7 +87,7 @@ tests = [
(r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
# NOTE: not an error under PCRE/PRE: # NOTE: not an error under PCRE/PRE:
(r'\u', '', SYNTAX_ERROR), # A Perl escape (r'\u', '', SYNTAX_ERROR), # A Perl escape
(r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'), # (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'),
(r'\xff', '\377', SUCCEED, 'found', chr(255)), (r'\xff', '\377', SUCCEED, 'found', chr(255)),
# new \x semantics # new \x semantics
(r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)), (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)),
@ -607,8 +607,8 @@ xyzabc
# new \x semantics # new \x semantics
(r'\x00ff', '\377', FAIL), (r'\x00ff', '\377', FAIL),
# (r'\x00ff', '\377', SUCCEED, 'found', chr(255)), # (r'\x00ff', '\377', SUCCEED, 'found', chr(255)),
(r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', '\t\n\v\r\f\a'),
('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), ('\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', '\t\n\v\r\f\a'),
(r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)), (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)),
(r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'), (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'),

View File

@ -100,11 +100,14 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
'\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
(chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
with self.subTest(c):
with self.assertWarns(DeprecationWarning):
self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
@ -551,14 +554,23 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"\(", '(').group(), '(') self.assertEqual(re.match(r"\(", '(').group(), '(')
self.assertIsNone(re.match(r"\(", ')')) self.assertIsNone(re.match(r"\(", ')'))
self.assertEqual(re.match(r"\\", '\\').group(), '\\') self.assertEqual(re.match(r"\\", '\\').group(), '\\')
self.assertEqual(re.match(r"\y", 'y').group(), 'y')
self.assertIsNone(re.match(r"\y", 'z'))
self.assertEqual(re.match(r"[\]]", ']').group(), ']') self.assertEqual(re.match(r"[\]]", ']').group(), ']')
self.assertIsNone(re.match(r"[\]]", '[')) self.assertIsNone(re.match(r"[\]]", '['))
self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
self.assertIsNone(re.match(r"[a\-c]", 'b')) self.assertIsNone(re.match(r"[a\-c]", 'b'))
self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
self.assertIsNone(re.match(r"[\^a]+", 'b')) self.assertIsNone(re.match(r"[\^a]+", 'b'))
re.purge() # for warnings
for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
with self.subTest(c):
with self.assertWarns(DeprecationWarning):
self.assertEqual(re.fullmatch('\\%c' % c, c).group(), c)
self.assertIsNone(re.match('\\%c' % c, 'a'))
for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
with self.subTest(c):
with self.assertWarns(DeprecationWarning):
self.assertEqual(re.fullmatch('[\\%c]' % c, c).group(), c)
self.assertIsNone(re.match('[\\%c]' % c, 'a'))
def test_string_boundaries(self): def test_string_boundaries(self):
# See http://bugs.python.org/issue10713 # See http://bugs.python.org/issue10713
@ -907,8 +919,10 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0")) self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z")) self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
self.assertTrue(re.match(br"\u", b'u')) with self.assertWarns(DeprecationWarning):
self.assertTrue(re.match(br"\U", b'U')) self.assertTrue(re.match(br"\u1234", b'u1234'))
with self.assertWarns(DeprecationWarning):
self.assertTrue(re.match(br"\U00012345", b'U00012345'))
self.assertTrue(re.match(br"\0", b"\000")) self.assertTrue(re.match(br"\0", b"\000"))
self.assertTrue(re.match(br"\08", b"\0008")) self.assertTrue(re.match(br"\08", b"\0008"))
self.assertTrue(re.match(br"\01", b"\001")) self.assertTrue(re.match(br"\01", b"\001"))
@ -928,8 +942,10 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
self.assertTrue(re.match(br"[\u]", b'u')) with self.assertWarns(DeprecationWarning):
self.assertTrue(re.match(br"[\U]", b'U')) self.assertTrue(re.match(br"[\u1234]", b'u'))
with self.assertWarns(DeprecationWarning):
self.assertTrue(re.match(br"[\U00012345]", b'U'))
self.assertRaises(re.error, re.match, br"[\567]", b"") self.assertRaises(re.error, re.match, br"[\567]", b"")
self.assertRaises(re.error, re.match, br"[\911]", b"") self.assertRaises(re.error, re.match, br"[\911]", b"")
self.assertRaises(re.error, re.match, br"[\x1z]", b"") self.assertRaises(re.error, re.match, br"[\x1z]", b"")
@ -1304,8 +1320,9 @@ class ReTests(unittest.TestCase):
def test_bug_13899(self): def test_bug_13899(self):
# Issue #13899: re pattern r"[\A]" should work like "A" but matches # Issue #13899: re pattern r"[\A]" should work like "A" but matches
# nothing. Ditto B and Z. # nothing. Ditto B and Z.
self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), with self.assertWarns(DeprecationWarning):
['A', 'B', '\b', 'C', 'Z']) self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
['A', 'B', '\b', 'C', 'Z'])
@bigmemtest(size=_2G, memuse=1) @bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size): def test_large_search(self, size):

View File

@ -30,6 +30,10 @@ Core and Builtins
Library Library
------- -------
- Issue #23622: Unknown escapes in regular expressions that consist of ``'\'``
and an ASCII letter now raise a deprecation warning and will be forbidden in
Python 3.6.
- Issue #23671: string.Template now allows to specify the "self" parameter as - Issue #23671: string.Template now allows to specify the "self" parameter as
keyword argument. string.Formatter now allows to specify the "self" and keyword argument. string.Formatter now allows to specify the "self" and
the "format_string" parameters as keyword arguments. the "format_string" parameters as keyword arguments.