diff --git a/Lib/test/test_ast.py b/Lib/test/test_ast.py index 314b360c58b..039d1c1010b 100644 --- a/Lib/test/test_ast.py +++ b/Lib/test/test_ast.py @@ -1078,8 +1078,7 @@ Module( ast.literal_eval(node) def test_literal_eval_syntax_errors(self): - msg = "unexpected character after line continuation character" - with self.assertRaisesRegex(SyntaxError, msg): + with self.assertRaisesRegex(SyntaxError, "unexpected indent"): ast.literal_eval(r''' \ (\ diff --git a/Lib/test/test_syntax.py b/Lib/test/test_syntax.py index 968d34809ce..a6ff319af2a 100644 --- a/Lib/test/test_syntax.py +++ b/Lib/test/test_syntax.py @@ -1613,6 +1613,36 @@ pass except SyntaxError: self.fail("Empty line after a line continuation character is valid.") + # See issue-46091 + s1 = r"""\ +def fib(n): + \ +'''Print a Fibonacci series up to n.''' + \ +a, b = 0, 1 +""" + s2 = r"""\ +def fib(n): + '''Print a Fibonacci series up to n.''' + a, b = 0, 1 +""" + try: + self.assertEqual(compile(s1, '', 'exec'), compile(s2, '', 'exec')) + except SyntaxError: + self.fail("Indented statement over multiple lines is valid") + + def test_continuation_bad_indentation(self): + # Check that code that breaks indentation across multiple lines raises a syntax error + + code = r"""\ +if x: + y = 1 + \ + foo = 1 + """ + + self.assertRaises(IndentationError, exec, code) + @support.cpython_only def test_nested_named_except_blocks(self): code = "" diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index ca2821de7c0..334390abaa2 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -6,6 +6,7 @@ from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, NEWLINE, _generate_tokens_from_c_tokenizer) from io import BytesIO, StringIO import unittest +from textwrap import dedent from unittest import TestCase, mock from test.test_grammar import (VALID_UNDERSCORE_LITERALS, INVALID_UNDERSCORE_LITERALS) @@ -44,7 +45,6 @@ class TokenizeTest(TestCase): # The ENDMARKER and final NEWLINE are omitted. f = BytesIO(s.encode('utf-8')) result = stringify_tokens_from_source(tokenize(f.readline), s) - self.assertEqual(result, [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) @@ -2511,7 +2511,105 @@ async def f(): self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000) self.assertRaises(SyntaxError, get_tokens, "]") + + def test_continuation_lines_indentation(self): + def get_tokens(string): + return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)] + code = dedent(""" + def fib(n): + \\ + '''Print a Fibonacci series up to n.''' + \\ + a, b = 0, 1 + """) + + self.check_tokenize(code, """\ + NAME 'def' (2, 0) (2, 3) + NAME 'fib' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'n' (2, 8) (2, 9) + RPAR ')' (2, 9) (2, 10) + COLON ':' (2, 10) (2, 11) + NEWLINE '' (2, 11) (2, 11) + INDENT '' (4, -1) (4, -1) + STRING "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39) + NEWLINE '' (4, 39) (4, 39) + NAME 'a' (6, 0) (6, 1) + COMMA ',' (6, 1) (6, 2) + NAME 'b' (6, 3) (6, 4) + EQUAL '=' (6, 5) (6, 6) + NUMBER '0' (6, 7) (6, 8) + COMMA ',' (6, 8) (6, 9) + NUMBER '1' (6, 10) (6, 11) + NEWLINE '' (6, 11) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + code_no_cont = dedent(""" + def fib(n): + '''Print a Fibonacci series up to n.''' + a, b = 0, 1 + """) + + self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + + code = dedent(""" + pass + \\ + + pass + """) + + self.check_tokenize(code, """\ + NAME 'pass' (2, 0) (2, 4) + NEWLINE '' (2, 4) (2, 4) + NAME 'pass' (5, 0) (5, 4) + NEWLINE '' (5, 4) (5, 4) + """) + + code_no_cont = dedent(""" + pass + pass + """) + + self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + + code = dedent(""" + if x: + y = 1 + \\ + \\ + \\ + \\ + foo = 1 + """) + + self.check_tokenize(code, """\ + NAME 'if' (2, 0) (2, 2) + NAME 'x' (2, 3) (2, 4) + COLON ':' (2, 4) (2, 5) + NEWLINE '' (2, 5) (2, 5) + INDENT '' (3, -1) (3, -1) + NAME 'y' (3, 4) (3, 5) + EQUAL '=' (3, 6) (3, 7) + NUMBER '1' (3, 8) (3, 9) + NEWLINE '' (3, 9) (3, 9) + NAME 'foo' (8, 4) (8, 7) + EQUAL '=' (8, 8) (8, 9) + NUMBER '1' (8, 10) (8, 11) + NEWLINE '' (8, 11) (8, 11) + DEDENT '' (8, -1) (8, -1) + """) + + code_no_cont = dedent(""" + if x: + y = 1 + foo = 1 + """) + + self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst b/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst new file mode 100644 index 00000000000..a2eee0f3ebd --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst @@ -0,0 +1,2 @@ +Correctly calculate indentation levels for lines with whitespace character +that are ended by line continuation characters. Patch by Pablo Galindo diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 5e35d6fa621..cd4254f8b90 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1347,6 +1347,24 @@ tok_decimal_tail(struct tok_state *tok) /* Get next token, after space stripping etc. */ +static inline int +tok_continuation_line(struct tok_state *tok) { + int c = tok_nextc(tok); + if (c != '\n') { + tok->done = E_LINECONT; + return -1; + } + c = tok_nextc(tok); + if (c == EOF) { + tok->done = E_EOF; + tok->cur = tok->inp; + return -1; + } else { + tok_backup(tok, c); + } + return c; +} + static int tok_get(struct tok_state *tok, const char **p_start, const char **p_end) { @@ -1363,6 +1381,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) int col = 0; int altcol = 0; tok->atbol = 0; + int cont_line_col = 0; for (;;) { c = tok_nextc(tok); if (c == ' ') { @@ -1375,14 +1394,23 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) else if (c == '\014') {/* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ } + else if (c == '\\') { + // Indentation cannot be split over multiple physical lines + // using backslashes. This means that if we found a backslash + // preceded by whitespace, **the first one we find** determines + // the level of indentation of whatever comes next. + cont_line_col = cont_line_col ? cont_line_col : col; + if ((c = tok_continuation_line(tok)) == -1) { + return ERRORTOKEN; + } + } else { break; } } tok_backup(tok, c); - if (c == '#' || c == '\n' || c == '\\') { + if (c == '#' || c == '\n') { /* Lines with only whitespace and/or comments - and/or a line continuation character shouldn't affect the indentation and are not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive @@ -1403,6 +1431,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) may need to skip to the end of a comment */ } if (!blankline && tok->level == 0) { + col = cont_line_col ? cont_line_col : col; + altcol = cont_line_col ? cont_line_col : altcol; if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { @@ -1964,19 +1994,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Line continuation */ if (c == '\\') { - c = tok_nextc(tok); - if (c != '\n') { - tok->done = E_LINECONT; + if ((c = tok_continuation_line(tok)) == -1) { return ERRORTOKEN; } - c = tok_nextc(tok); - if (c == EOF) { - tok->done = E_EOF; - tok->cur = tok->inp; - return ERRORTOKEN; - } else { - tok_backup(tok, c); - } tok->cont_line = 1; goto again; /* Read next line */ }