#27364: Deprecate invalid escape strings in str/bytes.

Patch by Emanuel Barry, reviewed by Serhiy Storchaka and Martin Panter.
2016-09-08 15:34:08 -04:00 · 2016-09-08 15:34:08 -04:00 · 110b6fecbb
parent 186122ead2
commit 110b6fecbb
7 changed files with 48 additions and 12 deletions
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -560,6 +560,10 @@ is more easily recognized as broken.)  It is also important to note that the
 escape sequences only recognized in string literals fall into the category of
 unrecognized escapes for bytes literals.

+   .. versionchanged:: 3.6
+      Unrecognized escape sequences produce a DeprecationWarning.  In
+      some future version of Python they will be a SyntaxError.
+
 Even in a raw literal, quotes can be escaped with a backslash, but the
 backslash remains in the result; for example, ``r"\""`` is a valid string
 literal consisting of two characters: a backslash and a double quote; ``r"\"``
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -952,6 +952,11 @@ Deprecated features
  parameter will be dropped in a future Python release and likely earlier
  through third party tools. See :issue:`27919` for details.

+* A backslash-character pair that is not a valid escape sequence now generates
+  a DeprecationWarning.  Although this will eventually become a SyntaxError,
+  that will not be for several Python releases.  (Contributed by Emanuel Barry
+  in :issue:`27364`.)
+

 Deprecated Python behavior
 --------------------------
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1175,7 +1175,7 @@ class EscapeDecodeTest(unittest.TestCase):
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
-        check(br"[\\]", br"[\]")
+        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
@@ -1184,7 +1184,6 @@ class EscapeDecodeTest(unittest.TestCase):
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
-        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
@@ -1192,12 +1191,18 @@ class EscapeDecodeTest(unittest.TestCase):
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
-        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
-        for b in range(256):
-            if b not in b'\n"\'\\abtnvfr01234567x':
-                b = bytes([b])
-                check(b'\\' + b, b'\\' + b)
+        for i in range(97, 123):
+            b = bytes([i])
+            if b not in b'abfnrtvx':
+                with self.assertWarns(DeprecationWarning):
+                    check(b"\\" + b, b"\\" + b)
+            with self.assertWarns(DeprecationWarning):
+                check(b"\\" + b.upper(), b"\\" + b.upper())
+        with self.assertWarns(DeprecationWarning):
+            check(br"\8", b"\\8")
+        with self.assertWarns(DeprecationWarning):
+            check(br"\9", b"\\9")

    def test_errors(self):
        decode = codecs.escape_decode
@@ -2448,7 +2453,6 @@ class UnicodeEscapeTest(unittest.TestCase):
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
-        check(br"[\8]", r"[\8]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
@@ -2458,9 +2462,18 @@ class UnicodeEscapeTest(unittest.TestCase):
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
-        for b in range(256):
-            if b not in b'\n"\'\\abtnvfr01234567xuUN':
-                check(b'\\' + bytes([b]), '\\' + chr(b))
+        for i in range(97, 123):
+            b = bytes([i])
+            if b not in b'abfnrtuvx':
+                with self.assertWarns(DeprecationWarning):
+                    check(b"\\" + b, "\\" + chr(i))
+            if b.upper() not in b'UN':
+                with self.assertWarns(DeprecationWarning):
+                    check(b"\\" + b.upper(), "\\" + chr(i-32))
+        with self.assertWarns(DeprecationWarning):
+            check(br"\8", "\\8")
+        with self.assertWarns(DeprecationWarning):
+            check(br"\9", "\\9")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -10,6 +10,7 @@ import codecs
 import itertools
 import operator
 import struct
+import string
 import sys
 import unittest
 import warnings
@@ -2752,6 +2753,12 @@ class UnicodeTest(string_tests.CommonTest,
        support.check_free_after_iterating(self, iter, str)
        support.check_free_after_iterating(self, reversed, str)

+    def test_invalid_sequences(self):
+        for letter in string.ascii_letters + "89": # 0-7 are octal escapes
+            if letter in "abfnrtuvxNU":
+                continue
+            with self.assertWarns(DeprecationWarning):
+                eval(r"'\%s'" % letter)

 class StringModuleTest(unittest.TestCase):
    def test_formatter_parser(self):
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.6.0 beta 1
 Core and Builtins
 -----------------

+- Issue #27364: A backslash-character pair that is not a valid escape sequence
+  now generates a DeprecationWarning.
+
 - Issue #27350: `dict` implementation is changed like PyPy. It is more compact
  and preserves insertion order.
  (Concept developed by Raymond Hettinger and patch by Inada Naoki.)
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -1207,8 +1207,9 @@ PyObject *PyBytes_DecodeEscape(const char *s,
            break;

        default:
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0)
+                goto failed;
            *p++ = '\\';
-            s--;
            goto non_esc; /* an arbitrary number of unescaped
                             UTF-8 bytes may follow. */
        }
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6065,6 +6065,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
            goto error;

        default:
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                                 "invalid escape sequence '\\%c'", c) < 0)
+                goto onError;
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;