From 0b3847de6dbe451d38d8de940717a5a1f186c2e9 Mon Sep 17 00:00:00 2001
From: Christian Heimes
Date: Wed, 20 Jun 2012 11:17:58 +0200
Subject: [PATCH] Issue #15096: Drop support for the ur string prefix

---
 Doc/reference/lexical_analysis.rst | 14 ++++++++------
 Lib/test/test_strlit.py            |  9 +++++++++
 Lib/test/test_tokenize.py          | 22 ++--------------------
 Lib/tokenize.py                    | 12 +++---------
 Misc/NEWS                          |  3 +++
 Parser/tokenizer.c                 |  5 +++--
 6 files changed, 28 insertions(+), 37 deletions(-)

diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst
index c94a47f0897..5e5903f7a92 100644
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -401,7 +401,7 @@ String literals are described by the following lexical definitions:
 
 .. productionlist::
    stringliteral: [`stringprefix`](`shortstring` | `longstring`)
-   stringprefix: "r" | "u" | "ur" | "R" | "U" | "UR" | "Ur" | "uR"
+   stringprefix: "r" | "u" | "R" | "U"
    shortstring: "'" `shortstringitem`* "'" | '"' `shortstringitem`* '"'
    longstring: "'''" `longstringitem`* "'''" | '"""' `longstringitem`* '"""'
    shortstringitem: `shortstringchar` | `stringescapeseq`
@@ -444,19 +444,21 @@ must be expressed with escapes.
 As of Python 3.3 it is possible again to prefix unicode strings with a
 ``u`` prefix to simplify maintenance of dual 2.x and 3.x codebases.
 
-Both string and bytes literals may optionally be prefixed with a letter ``'r'``
+Bytes literals may optionally be prefixed with a letter ``'r'``
 or ``'R'``; such strings are called :dfn:`raw strings` and treat backslashes as
 literal characters.  As a result, in string literals, ``'\U'`` and ``'\u'``
-escapes in raw strings are not treated specially.
+escapes in raw strings are not treated specially. Given that Python 2.x's raw
+unicode literals behave differently than Python 3.x's, the ``'ur'`` syntax
+is not supported.
 
 .. versionadded:: 3.3
    The ``'rb'`` prefix of raw bytes literals has been added as a synonym
    of ``'br'``.
 
 .. versionadded:: 3.3
-   Support for the unicode legacy literal (``u'value'``) and other
-   versions were reintroduced to simplify the maintenance of dual
-   Python 2.x and 3.x codebases.  See :pep:`414` for more information.
+   Support for the unicode legacy literal (``u'value'``) was reintroduced
+   to simplify the maintenance of dual Python 2.x and 3.x codebases.
+   See :pep:`414` for more information.
 
 In triple-quoted strings, unescaped newlines and quotes are allowed (and are
 retained), except that three unescaped quotes in a row terminate the string.
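The grammar change above is easy to sanity-check by hand. The snippet below is
an illustrative reviewer's aid, not part of the patch; it assumes an
interpreter built with this change (Python 3.3+), and the "<prefix-check>"
file name passed to compile() is arbitrary.

    # Reviewer's sanity check (not part of the patch): only these prefix
    # combinations should compile once ur/uR/Ur/UR are gone.
    valid = ["r'x'", "R'x'", "u'x'", "U'x'", "b'x'", "br'x'", "rb'x'"]
    invalid = ["ur'x'", "uR'x'", "Ur'x'", "UR'x'", "ru'x'", "ub'x'"]

    for src in valid:
        compile(src, "<prefix-check>", "eval")      # must not raise
    for src in invalid:
        try:
            compile(src, "<prefix-check>", "eval")
        except SyntaxError:
            pass                                    # expected after this patch
        else:
            raise AssertionError("%r unexpectedly compiled" % src)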
diff --git a/Lib/test/test_strlit.py b/Lib/test/test_strlit.py
index 1f041c80abe..07bc48880a9 100644
--- a/Lib/test/test_strlit.py
+++ b/Lib/test/test_strlit.py
@@ -123,6 +123,15 @@ class TestLiterals(unittest.TestCase):
         self.assertRaises(SyntaxError, eval, """ rrb'' """)
         self.assertRaises(SyntaxError, eval, """ rbb'' """)
 
+    def test_eval_str_u(self):
+        self.assertEqual(eval(""" u'x' """), 'x')
+        self.assertEqual(eval(""" U'\u00e4' """), 'ä')
+        self.assertEqual(eval(""" u'\N{LATIN SMALL LETTER A WITH DIAERESIS}' """), 'ä')
+        self.assertRaises(SyntaxError, eval, """ ur'' """)
+        self.assertRaises(SyntaxError, eval, """ ru'' """)
+        self.assertRaises(SyntaxError, eval, """ bu'' """)
+        self.assertRaises(SyntaxError, eval, """ ub'' """)
+
     def check_encoding(self, encoding, extra=""):
         modname = "xx_" + encoding.replace("-", "_")
         fn = os.path.join(self.tmpdir, modname + ".py")

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 4c2e4e2b677..4e798d789f6 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -299,24 +299,6 @@ String literals
     STRING     'u"abc"'      (1, 0) (1, 6)
     OP         '+'           (1, 7) (1, 8)
     STRING     'U"abc"'      (1, 9) (1, 15)
-    >>> dump_tokens("ur'abc' + uR'abc' + Ur'abc' + UR'abc'")
-    ENCODING   'utf-8'       (0, 0) (0, 0)
-    STRING     "ur'abc'"     (1, 0) (1, 7)
-    OP         '+'           (1, 8) (1, 9)
-    STRING     "uR'abc'"     (1, 10) (1, 17)
-    OP         '+'           (1, 18) (1, 19)
-    STRING     "Ur'abc'"     (1, 20) (1, 27)
-    OP         '+'           (1, 28) (1, 29)
-    STRING     "UR'abc'"     (1, 30) (1, 37)
-    >>> dump_tokens('ur"abc" + uR"abc" + Ur"abc" + UR"abc"')
-    ENCODING   'utf-8'       (0, 0) (0, 0)
-    STRING     'ur"abc"'     (1, 0) (1, 7)
-    OP         '+'           (1, 8) (1, 9)
-    STRING     'uR"abc"'     (1, 10) (1, 17)
-    OP         '+'           (1, 18) (1, 19)
-    STRING     'Ur"abc"'     (1, 20) (1, 27)
-    OP         '+'           (1, 28) (1, 29)
-    STRING     'UR"abc"'     (1, 30) (1, 37)
 
     >>> dump_tokens("b'abc' + B'abc'")
     ENCODING   'utf-8'       (0, 0) (0, 0)
@@ -642,7 +624,7 @@ Non-ascii identifiers
 
 Legacy unicode literals:
 
-    >>> dump_tokens("Örter = u'places'\\ngrün = UR'green'")
+    >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
     ENCODING   'utf-8'       (0, 0) (0, 0)
     NAME       'Örter'       (1, 0) (1, 5)
     OP         '='           (1, 6) (1, 7)
@@ -650,7 +632,7 @@ Legacy unicode literals:
     NEWLINE    '\\n'          (1, 17) (1, 18)
     NAME       'grün'        (2, 0) (2, 4)
     OP         '='           (2, 5) (2, 6)
-    STRING     "UR'green'"   (2, 7) (2, 16)
+    STRING     "U'green'"    (2, 7) (2, 15)
 """
 
 from test import support
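The next file, Lib/tokenize.py, holds the subtle part of the patch: the old
StringPrefix alternative [uUbB][rR]? let u/U combine with a trailing r/R,
while the new alternation keeps br/rb in either order and case but makes u/U
stand alone. The standalone harness below is illustrative only, not part of
the patch; it appends a quote and an end anchor to mimic how the tokenizer
ties a prefix to the opening quote of a literal.

    import re

    # New prefix pattern from Lib/tokenize.py, exercised in isolation
    # (illustrative; the real module embeds it in larger token regexes).
    StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
    opening = re.compile(StringPrefix + r"'\Z")

    for tok in ("'", "u'", "U'", "r'", "rb'", "bR'", "Rb'"):
        assert opening.match(tok), tok          # still recognized
    for tok in ("ur'", "uR'", "Ur'", "UR'", "ru'"):
        assert not opening.match(tok), tok      # rejected: u cannot pair with r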
Single = r"[^'\\]*(?:\\.[^'\\]*)*'" @@ -183,12 +183,8 @@ endpats = {"'": Single, '"': Double, "rB'''": Single3, 'rB"""': Double3, "RB'''": Single3, 'RB"""': Double3, "u'''": Single3, 'u"""': Double3, - "ur'''": Single3, 'ur"""': Double3, "R'''": Single3, 'R"""': Double3, "U'''": Single3, 'U"""': Double3, - "uR'''": Single3, 'uR"""': Double3, - "Ur'''": Single3, 'Ur"""': Double3, - "UR'''": Single3, 'UR"""': Double3, 'r': None, 'R': None, 'b': None, 'B': None, 'u': None, 'U': None} @@ -201,8 +197,7 @@ for t in ("'''", '"""', "rb'''", 'rb"""', "rB'''", 'rB"""', "Rb'''", 'Rb"""', "RB'''", 'RB"""', "u'''", 'u"""', "U'''", 'U"""', - "ur'''", 'ur"""', "Ur'''", 'Ur"""', - "uR'''", 'uR"""', "UR'''", 'UR"""'): + ): triple_quoted[t] = t single_quoted = {} for t in ("'", '"', @@ -213,8 +208,7 @@ for t in ("'", '"', "rb'", 'rb"', "rB'", 'rB"', "Rb'", 'Rb"', "RB'", 'RB"' , "u'", 'u"', "U'", 'U"', - "ur'", 'ur"', "Ur'", 'Ur"', - "uR'", 'uR"', "UR'", 'UR"' ): + ): single_quoted[t] = t tabsize = 8 diff --git a/Misc/NEWS b/Misc/NEWS index bcf28bd1737..2f9236ff36b 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.3.0 Beta 1? Core and Builtins ----------------- +- Issue #15096: Removed support for ur'' as the raw notation isn't + compatible with Python 2.x's raw unicode strings. + - Issue #13783: Generator objects now use the identifier APIs internally - Issue #14874: Restore charmap decoding speed to pre-PEP 393 levels. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 36ca0791cf1..93a4a5ccb47 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1412,7 +1412,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) /* Identifier (most frequent token!) */ nonascii = 0; if (is_potential_identifier_start(c)) { - /* Process b"", r"", u"", br"", rb"" and ur"" */ + /* Process b"", r"", u"", br"" and rb"" */ int saw_b = 0, saw_r = 0, saw_u = 0; while (1) { if (!(saw_b || saw_u) && (c == 'b' || c == 'B')) @@ -1421,7 +1421,8 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) want to support it in arbitrary order like byte literals. */ else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U')) saw_u = 1; - else if (!saw_r && (c == 'r' || c == 'R')) + /* ur"" and ru"" are not supported */ + else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) saw_r = 1; else break;