Issue #18037: 2to3 now escapes '\u' and '\U' in native strings.

2013-10-03 12:08:38 +03:00 · 2013-10-03 12:08:38 +03:00 · def0a4c298
parent 2a8b3f26b9
commit def0a4c298
3 changed files with 64 additions and 7 deletions
--- a/Lib/lib2to3/fixes/fix_unicode.py
+++ b/Lib/lib2to3/fixes/fix_unicode.py
@@ -1,25 +1,43 @@
-"""Fixer that changes unicode to str, unichr to chr, and u"..." into "...".
+r"""Fixer for unicode.
+
+* Changes unicode to str and unichr to chr.
+
+* If "...\u..." is not a unicode literal, change it into "...\\u...".
+
+* Change u"..." into "...".

 """

-import re
 from ..pgen2 import token
 from .. import fixer_base

 _mapping = {"unichr" : "chr", "unicode" : "str"}
-_literal_re = re.compile(r"[uU][rR]?[\'\"]")

 class FixUnicode(fixer_base.BaseFix):
    BM_compatible = True
    PATTERN = "STRING | 'unicode' | 'unichr'"

+    def start_tree(self, tree, filename):
+        super(FixUnicode, self).start_tree(tree, filename)
+        self.unicode_literals = 'unicode_literals' in tree.future_features
+
    def transform(self, node, results):
        if node.type == token.NAME:
            new = node.clone()
            new.value = _mapping[node.value]
            return new
        elif node.type == token.STRING:
-            if _literal_re.match(node.value):
-                new = node.clone()
-                new.value = new.value[1:]
-                return new
+            val = node.value
+            if (not self.unicode_literals and val[0] in 'rR\'"' and
+                '\\' in val):
+                val = r'\\'.join([
+                    v.replace('\\u', r'\\u').replace('\\U', r'\\U')
+                    for v in val.split(r'\\')
+                ])
+            if val[0] in 'uU':
+                val = val[1:]
+            if val == node.value:
+                return node
+            new = node.clone()
+            new.value = val
+            return new
--- a/Lib/lib2to3/tests/test_fixers.py
+++ b/Lib/lib2to3/tests/test_fixers.py
@@ -2824,6 +2824,43 @@ class Test_unicode(FixerTestCase):
        a = """R'''x''' """
        self.check(b, a)

+    def test_native_literal_escape_u(self):
+        b = r"""'\\\u20ac\U0001d121\\u20ac'"""
+        a = r"""'\\\\u20ac\\U0001d121\\u20ac'"""
+        self.check(b, a)
+
+        b = r"""r'\\\u20ac\U0001d121\\u20ac'"""
+        a = r"""r'\\\\u20ac\\U0001d121\\u20ac'"""
+        self.check(b, a)
+
+    def test_bytes_literal_escape_u(self):
+        b = r"""b'\\\u20ac\U0001d121\\u20ac'"""
+        a = r"""b'\\\u20ac\U0001d121\\u20ac'"""
+        self.check(b, a)
+
+        b = r"""br'\\\u20ac\U0001d121\\u20ac'"""
+        a = r"""br'\\\u20ac\U0001d121\\u20ac'"""
+        self.check(b, a)
+
+    def test_unicode_literal_escape_u(self):
+        b = r"""u'\\\u20ac\U0001d121\\u20ac'"""
+        a = r"""'\\\u20ac\U0001d121\\u20ac'"""
+        self.check(b, a)
+
+        b = r"""ur'\\\u20ac\U0001d121\\u20ac'"""
+        a = r"""r'\\\u20ac\U0001d121\\u20ac'"""
+        self.check(b, a)
+
+    def test_native_unicode_literal_escape_u(self):
+        f = 'from __future__ import unicode_literals\n'
+        b = f + r"""'\\\u20ac\U0001d121\\u20ac'"""
+        a = f + r"""'\\\u20ac\U0001d121\\u20ac'"""
+        self.check(b, a)
+
+        b = f + r"""r'\\\u20ac\U0001d121\\u20ac'"""
+        a = f + r"""r'\\\u20ac\U0001d121\\u20ac'"""
+        self.check(b, a)
+
 class Test_callable(FixerTestCase):
    fixer = "callable"

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -71,6 +71,8 @@ Core and Builtins
 Library
 -------

+- Issue #18037: 2to3 now escapes '\u' and '\U' in native strings.
+
 - Issue #19137: The pprint module now correctly formats instances of set and
  frozenset subclasses.