From 69e10976b2e7682c6d57f4272932ebc19f8e8859 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Tue, 8 Feb 2022 11:54:37 +0000 Subject: [PATCH] bpo-46521: Fix codeop to use a new partial-input mode of the parser (GH-31010) --- Include/cpython/compile.h | 4 +- Include/errcode.h | 2 + Lib/codeop.py | 54 ++++--------------- .../2022-02-01-19-34-28.bpo-46521.IMUIrs.rst | 2 + Parser/pegen.c | 15 +++++- Parser/pegen.h | 1 + Parser/tokenizer.c | 26 +++++---- 7 files changed, 49 insertions(+), 55 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-02-01-19-34-28.bpo-46521.IMUIrs.rst diff --git a/Include/cpython/compile.h b/Include/cpython/compile.h index a202c0b0e65..518a3764992 100644 --- a/Include/cpython/compile.h +++ b/Include/cpython/compile.h @@ -18,8 +18,10 @@ #define PyCF_IGNORE_COOKIE 0x0800 #define PyCF_TYPE_COMMENTS 0x1000 #define PyCF_ALLOW_TOP_LEVEL_AWAIT 0x2000 +#define PyCF_ALLOW_INCOMPLETE_INPUT 0x4000 #define PyCF_COMPILE_MASK (PyCF_ONLY_AST | PyCF_ALLOW_TOP_LEVEL_AWAIT | \ - PyCF_TYPE_COMMENTS | PyCF_DONT_IMPLY_DEDENT) + PyCF_TYPE_COMMENTS | PyCF_DONT_IMPLY_DEDENT | \ + PyCF_ALLOW_INCOMPLETE_INPUT) typedef struct { int cf_flags; /* bitmask of CO_xxx flags relevant to future */ diff --git a/Include/errcode.h b/Include/errcode.h index 2e07fc2c963..54ae929bf25 100644 --- a/Include/errcode.h +++ b/Include/errcode.h @@ -26,6 +26,8 @@ extern "C" { #define E_TOODEEP 20 /* Too many indentation levels */ #define E_DEDENT 21 /* No matching outer block for dedent */ #define E_DECODE 22 /* Error in decoding into Unicode */ +#define E_EOFS 23 /* EOF in triple-quoted string */ +#define E_EOLS 24 /* EOL in single-quoted string */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */ #define E_BADSINGLE 27 /* Ill-formed single statement input */ #define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */ diff --git a/Lib/codeop.py b/Lib/codeop.py index 6b56be488ee..568e9bbc118 100644 --- a/Lib/codeop.py +++ b/Lib/codeop.py @@ -10,30 +10,6 @@ and: syntax error (OverflowError and ValueError can be produced by malformed literals). -Approach: - -First, check if the source consists entirely of blank lines and -comments; if so, replace it with 'pass', because the built-in -parser doesn't always do the right thing for these. - -Compile three times: as is, with \n, and with \n\n appended. If it -compiles as is, it's complete. If it compiles with one \n appended, -we expect more. If it doesn't compile either way, we compare the -error we get when compiling with \n or \n\n appended. If the errors -are the same, the code is broken. But if the errors are different, we -expect more. Not intuitive; not even guaranteed to hold in future -releases; but this matches the compiler's behavior from Python 1.4 -through 2.2, at least. - -Caveat: - -It is possible (but not likely) that the parser stops parsing with a -successful outcome before reaching the end of the source; in this -case, trailing symbols may be ignored instead of causing an error. -For example, a backslash followed by two newlines may be followed by -arbitrary garbage. This will be fixed once the API for the parser is -better. - The two interfaces are: compile_command(source, filename, symbol): @@ -64,7 +40,11 @@ _features = [getattr(__future__, fname) __all__ = ["compile_command", "Compile", "CommandCompiler"] -PyCF_DONT_IMPLY_DEDENT = 0x200 # Matches pythonrun.h. +# The following flags match the values from Include/cpython/compile.h +# Caveat emptor: These flags are undocumented on purpose and depending +# on their effect outside the standard library is **unsupported**. +PyCF_DONT_IMPLY_DEDENT = 0x200 +PyCF_ALLOW_INCOMPLETE_INPUT = 0x4000 def _maybe_compile(compiler, source, filename, symbol): # Check for source consisting of only blank lines and comments. @@ -86,24 +66,12 @@ def _maybe_compile(compiler, source, filename, symbol): with warnings.catch_warnings(): warnings.simplefilter("error") - code1 = err1 = err2 = None try: - code1 = compiler(source + "\n", filename, symbol) + compiler(source + "\n", filename, symbol) except SyntaxError as e: - err1 = e - - try: - code2 = compiler(source + "\n\n", filename, symbol) - except SyntaxError as e: - err2 = e - - try: - if not code1 and _is_syntax_error(err1, err2): - raise err1 - else: - return None - finally: - err1 = err2 = None + if "incomplete input" in str(e): + return None + raise def _is_syntax_error(err1, err2): rep1 = repr(err1) @@ -115,7 +83,7 @@ def _is_syntax_error(err1, err2): return False def _compile(source, filename, symbol): - return compile(source, filename, symbol, PyCF_DONT_IMPLY_DEDENT) + return compile(source, filename, symbol, PyCF_DONT_IMPLY_DEDENT | PyCF_ALLOW_INCOMPLETE_INPUT) def compile_command(source, filename="", symbol="single"): r"""Compile a command and determine whether it is incomplete. @@ -144,7 +112,7 @@ class Compile: statement, it "remembers" and compiles all subsequent program texts with the statement in force.""" def __init__(self): - self.flags = PyCF_DONT_IMPLY_DEDENT + self.flags = PyCF_DONT_IMPLY_DEDENT | PyCF_ALLOW_INCOMPLETE_INPUT def __call__(self, source, filename, symbol): codeob = compile(source, filename, symbol, self.flags, True) diff --git a/Misc/NEWS.d/next/Library/2022-02-01-19-34-28.bpo-46521.IMUIrs.rst b/Misc/NEWS.d/next/Library/2022-02-01-19-34-28.bpo-46521.IMUIrs.rst new file mode 100644 index 00000000000..4e9fa08d4df --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-02-01-19-34-28.bpo-46521.IMUIrs.rst @@ -0,0 +1,2 @@ +Fix a bug in the :mod:`codeop` module that was incorrectly identifying +invalid code involving string quotes as valid code. diff --git a/Parser/pegen.c b/Parser/pegen.c index 470c2cbd743..6adde843230 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -726,6 +726,9 @@ compute_parser_flags(PyCompilerFlags *flags) if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) { parser_flags |= PyPARSE_ASYNC_HACKS; } + if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) { + parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT; + } return parser_flags; } @@ -811,16 +814,26 @@ reset_parser_state_for_error_pass(Parser *p) p->tok->interactive_underflow = IUNDERFLOW_STOP; } +static inline int +_is_end_of_source(Parser *p) { + int err = p->tok->done; + return err == E_EOF || err == E_EOFS || err == E_EOLS; +} + void * _PyPegen_run_parser(Parser *p) { void *res = _PyPegen_parse(p); assert(p->level == 0); if (res == NULL) { + if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) { + PyErr_Clear(); + return RAISE_SYNTAX_ERROR("incomplete input"); + } if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) { return NULL; } - // Make a second parser pass. In this pass we activate heavier and slower checks + // Make a second parser pass. In this pass we activate heavier and slower checks // to produce better error messages and more complete diagnostics. Extra "invalid_*" // rules will be active during parsing. Token *last_token = p->tokens[p->fill - 1]; diff --git a/Parser/pegen.h b/Parser/pegen.h index caba34e535b..061ca3a2013 100644 --- a/Parser/pegen.h +++ b/Parser/pegen.h @@ -22,6 +22,7 @@ #define PyPARSE_BARRY_AS_BDFL 0x0020 #define PyPARSE_TYPE_COMMENTS 0x0040 #define PyPARSE_ASYNC_HACKS 0x0080 +#define PyPARSE_ALLOW_INCOMPLETE_INPUT 0x0100 #define CURRENT_POS (-5) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 5b5cbdb809e..d38df66c69e 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -40,7 +40,7 @@ static struct tok_state *tok_new(void); static int tok_nextc(struct tok_state *tok); static void tok_backup(struct tok_state *tok, int c); - +static int syntaxerror(struct tok_state *tok, const char *format, ...); /* Spaces in this constant are treated as "zero or more spaces or tabs" when tokenizing. */ @@ -1031,8 +1031,9 @@ tok_nextc(struct tok_state *tok) if (tok->cur != tok->inp) { return Py_CHARMASK(*tok->cur++); /* Fast path */ } - if (tok->done != E_OK) - return EOF; + if (tok->done != E_OK) { + return EOF; + } if (tok->fp == NULL) { rc = tok_underflow_string(tok); } @@ -1964,16 +1965,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) tok->line_start = tok->multi_line_start; int start = tok->lineno; tok->lineno = tok->first_lineno; - if (quote_size == 3) { - return syntaxerror(tok, - "unterminated triple-quoted string literal" - " (detected at line %d)", start); + syntaxerror(tok, "unterminated triple-quoted string literal" + " (detected at line %d)", start); + if (c != '\n') { + tok->done = E_EOFS; + } + return ERRORTOKEN; } else { - return syntaxerror(tok, - "unterminated string literal (detected at" - " line %d)", start); + syntaxerror(tok, "unterminated string literal (detected at" + " line %d)", start); + if (c != '\n') { + tok->done = E_EOLS; + } + return ERRORTOKEN; } } if (c == quote) {