bpo-46521: Fix codeop to use a new partial-input mode of the parser (GH-31010)

This commit is contained in:
Pablo Galindo Salgado 2022-02-08 11:54:37 +00:00 committed by GitHub
parent 25db2b361b
commit 69e10976b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 49 additions and 55 deletions

View File

@ -18,8 +18,10 @@
#define PyCF_IGNORE_COOKIE 0x0800 #define PyCF_IGNORE_COOKIE 0x0800
#define PyCF_TYPE_COMMENTS 0x1000 #define PyCF_TYPE_COMMENTS 0x1000
#define PyCF_ALLOW_TOP_LEVEL_AWAIT 0x2000 #define PyCF_ALLOW_TOP_LEVEL_AWAIT 0x2000
#define PyCF_ALLOW_INCOMPLETE_INPUT 0x4000
#define PyCF_COMPILE_MASK (PyCF_ONLY_AST | PyCF_ALLOW_TOP_LEVEL_AWAIT | \ #define PyCF_COMPILE_MASK (PyCF_ONLY_AST | PyCF_ALLOW_TOP_LEVEL_AWAIT | \
PyCF_TYPE_COMMENTS | PyCF_DONT_IMPLY_DEDENT) PyCF_TYPE_COMMENTS | PyCF_DONT_IMPLY_DEDENT | \
PyCF_ALLOW_INCOMPLETE_INPUT)
typedef struct { typedef struct {
int cf_flags; /* bitmask of CO_xxx flags relevant to future */ int cf_flags; /* bitmask of CO_xxx flags relevant to future */

View File

@ -26,6 +26,8 @@ extern "C" {
#define E_TOODEEP 20 /* Too many indentation levels */ #define E_TOODEEP 20 /* Too many indentation levels */
#define E_DEDENT 21 /* No matching outer block for dedent */ #define E_DEDENT 21 /* No matching outer block for dedent */
#define E_DECODE 22 /* Error in decoding into Unicode */ #define E_DECODE 22 /* Error in decoding into Unicode */
#define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */
#define E_BADSINGLE 27 /* Ill-formed single statement input */ #define E_BADSINGLE 27 /* Ill-formed single statement input */
#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */ #define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */

View File

@ -10,30 +10,6 @@ and:
syntax error (OverflowError and ValueError can be produced by syntax error (OverflowError and ValueError can be produced by
malformed literals). malformed literals).
Approach:
First, check if the source consists entirely of blank lines and
comments; if so, replace it with 'pass', because the built-in
parser doesn't always do the right thing for these.
Compile three times: as is, with \n, and with \n\n appended. If it
compiles as is, it's complete. If it compiles with one \n appended,
we expect more. If it doesn't compile either way, we compare the
error we get when compiling with \n or \n\n appended. If the errors
are the same, the code is broken. But if the errors are different, we
expect more. Not intuitive; not even guaranteed to hold in future
releases; but this matches the compiler's behavior from Python 1.4
through 2.2, at least.
Caveat:
It is possible (but not likely) that the parser stops parsing with a
successful outcome before reaching the end of the source; in this
case, trailing symbols may be ignored instead of causing an error.
For example, a backslash followed by two newlines may be followed by
arbitrary garbage. This will be fixed once the API for the parser is
better.
The two interfaces are: The two interfaces are:
compile_command(source, filename, symbol): compile_command(source, filename, symbol):
@ -64,7 +40,11 @@ _features = [getattr(__future__, fname)
__all__ = ["compile_command", "Compile", "CommandCompiler"] __all__ = ["compile_command", "Compile", "CommandCompiler"]
PyCF_DONT_IMPLY_DEDENT = 0x200 # Matches pythonrun.h. # The following flags match the values from Include/cpython/compile.h
# Caveat emptor: These flags are undocumented on purpose and depending
# on their effect outside the standard library is **unsupported**.
PyCF_DONT_IMPLY_DEDENT = 0x200
PyCF_ALLOW_INCOMPLETE_INPUT = 0x4000
def _maybe_compile(compiler, source, filename, symbol): def _maybe_compile(compiler, source, filename, symbol):
# Check for source consisting of only blank lines and comments. # Check for source consisting of only blank lines and comments.
@ -86,24 +66,12 @@ def _maybe_compile(compiler, source, filename, symbol):
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("error") warnings.simplefilter("error")
code1 = err1 = err2 = None
try: try:
code1 = compiler(source + "\n", filename, symbol) compiler(source + "\n", filename, symbol)
except SyntaxError as e: except SyntaxError as e:
err1 = e if "incomplete input" in str(e):
try:
code2 = compiler(source + "\n\n", filename, symbol)
except SyntaxError as e:
err2 = e
try:
if not code1 and _is_syntax_error(err1, err2):
raise err1
else:
return None return None
finally: raise
err1 = err2 = None
def _is_syntax_error(err1, err2): def _is_syntax_error(err1, err2):
rep1 = repr(err1) rep1 = repr(err1)
@ -115,7 +83,7 @@ def _is_syntax_error(err1, err2):
return False return False
def _compile(source, filename, symbol): def _compile(source, filename, symbol):
return compile(source, filename, symbol, PyCF_DONT_IMPLY_DEDENT) return compile(source, filename, symbol, PyCF_DONT_IMPLY_DEDENT | PyCF_ALLOW_INCOMPLETE_INPUT)
def compile_command(source, filename="<input>", symbol="single"): def compile_command(source, filename="<input>", symbol="single"):
r"""Compile a command and determine whether it is incomplete. r"""Compile a command and determine whether it is incomplete.
@ -144,7 +112,7 @@ class Compile:
statement, it "remembers" and compiles all subsequent program texts statement, it "remembers" and compiles all subsequent program texts
with the statement in force.""" with the statement in force."""
def __init__(self): def __init__(self):
self.flags = PyCF_DONT_IMPLY_DEDENT self.flags = PyCF_DONT_IMPLY_DEDENT | PyCF_ALLOW_INCOMPLETE_INPUT
def __call__(self, source, filename, symbol): def __call__(self, source, filename, symbol):
codeob = compile(source, filename, symbol, self.flags, True) codeob = compile(source, filename, symbol, self.flags, True)

View File

@ -0,0 +1,2 @@
Fix a bug in the :mod:`codeop` module that was incorrectly identifying
invalid code involving string quotes as valid code.

View File

@ -726,6 +726,9 @@ compute_parser_flags(PyCompilerFlags *flags)
if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) { if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
parser_flags |= PyPARSE_ASYNC_HACKS; parser_flags |= PyPARSE_ASYNC_HACKS;
} }
if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
}
return parser_flags; return parser_flags;
} }
@ -811,12 +814,22 @@ reset_parser_state_for_error_pass(Parser *p)
p->tok->interactive_underflow = IUNDERFLOW_STOP; p->tok->interactive_underflow = IUNDERFLOW_STOP;
} }
/* Tell whether the tokenizer's recorded status means the source text ran out.
 *
 * Reads p->tok->done and returns 1 when it equals E_EOF, E_EOFS (EOF in a
 * triple-quoted string) or E_EOLS (EOL in a single-quoted string); returns 0
 * for every other status.  Nothing reachable through p is modified.
 *
 * _PyPegen_run_parser consults this after a failed parse, when
 * PyPARSE_ALLOW_INCOMPLETE_INPUT is set, to decide whether to report
 * "incomplete input".
 */
static inline int
_is_end_of_source(Parser *p) {
    const int status = p->tok->done;
    if (status == E_EOF) {
        return 1;
    }
    if (status == E_EOFS) {
        return 1;
    }
    return status == E_EOLS;
}
void * void *
_PyPegen_run_parser(Parser *p) _PyPegen_run_parser(Parser *p)
{ {
void *res = _PyPegen_parse(p); void *res = _PyPegen_parse(p);
assert(p->level == 0); assert(p->level == 0);
if (res == NULL) { if (res == NULL) {
if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) {
PyErr_Clear();
return RAISE_SYNTAX_ERROR("incomplete input");
}
if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) { if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
return NULL; return NULL;
} }

View File

@ -22,6 +22,7 @@
#define PyPARSE_BARRY_AS_BDFL 0x0020 #define PyPARSE_BARRY_AS_BDFL 0x0020
#define PyPARSE_TYPE_COMMENTS 0x0040 #define PyPARSE_TYPE_COMMENTS 0x0040
#define PyPARSE_ASYNC_HACKS 0x0080 #define PyPARSE_ASYNC_HACKS 0x0080
#define PyPARSE_ALLOW_INCOMPLETE_INPUT 0x0100
#define CURRENT_POS (-5) #define CURRENT_POS (-5)

View File

@ -40,7 +40,7 @@
static struct tok_state *tok_new(void); static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok); static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c); static void tok_backup(struct tok_state *tok, int c);
static int syntaxerror(struct tok_state *tok, const char *format, ...);
/* Spaces in this constant are treated as "zero or more spaces or tabs" when /* Spaces in this constant are treated as "zero or more spaces or tabs" when
tokenizing. */ tokenizing. */
@ -1031,8 +1031,9 @@ tok_nextc(struct tok_state *tok)
if (tok->cur != tok->inp) { if (tok->cur != tok->inp) {
return Py_CHARMASK(*tok->cur++); /* Fast path */ return Py_CHARMASK(*tok->cur++); /* Fast path */
} }
if (tok->done != E_OK) if (tok->done != E_OK) {
return EOF; return EOF;
}
if (tok->fp == NULL) { if (tok->fp == NULL) {
rc = tok_underflow_string(tok); rc = tok_underflow_string(tok);
} }
@ -1964,16 +1965,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
tok->line_start = tok->multi_line_start; tok->line_start = tok->multi_line_start;
int start = tok->lineno; int start = tok->lineno;
tok->lineno = tok->first_lineno; tok->lineno = tok->first_lineno;
if (quote_size == 3) { if (quote_size == 3) {
return syntaxerror(tok, syntaxerror(tok, "unterminated triple-quoted string literal"
"unterminated triple-quoted string literal"
" (detected at line %d)", start); " (detected at line %d)", start);
if (c != '\n') {
tok->done = E_EOFS;
}
return ERRORTOKEN;
} }
else { else {
return syntaxerror(tok, syntaxerror(tok, "unterminated string literal (detected at"
"unterminated string literal (detected at"
" line %d)", start); " line %d)", start);
if (c != '\n') {
tok->done = E_EOLS;
}
return ERRORTOKEN;
} }
} }
if (c == quote) { if (c == quote) {