From ebebb6429c224c713e1c63a0b05d4840f52c7415 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 23 Apr 2020 18:36:06 +0300 Subject: [PATCH] bpo-40334: Improve various PEG-Parser related stuff (GH-19669) The changes in this commit are all related to @vstinner's original review comments of the initial PEP 617 implementation PR. --- Include/{ => internal}/pegen_interface.h | 6 +- Makefile.pre.in | 2 +- Modules/_peg_parser.c | 2 +- PCbuild/pythoncore.vcxproj | 2 +- Parser/pegen/peg_api.c | 2 +- Parser/pegen/pegen.c | 76 +++++++++++++++--------- Python/pythonrun.c | 2 +- 7 files changed, 58 insertions(+), 34 deletions(-) rename Include/{ => internal}/pegen_interface.h (94%) diff --git a/Include/pegen_interface.h b/Include/internal/pegen_interface.h similarity index 94% rename from Include/pegen_interface.h rename to Include/internal/pegen_interface.h index bf5b29634ac..d8621c1a889 100644 --- a/Include/pegen_interface.h +++ b/Include/internal/pegen_interface.h @@ -1,10 +1,13 @@ -#ifndef Py_LIMITED_API #ifndef Py_PEGENINTERFACE #define Py_PEGENINTERFACE #ifdef __cplusplus extern "C" { #endif +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + #include "Python.h" #include "Python-ast.h" @@ -29,4 +32,3 @@ PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromFileObject(FILE *, PyObject *fi } #endif #endif /* !Py_PEGENINTERFACE*/ -#endif /* !Py_LIMITED_API */ diff --git a/Makefile.pre.in b/Makefile.pre.in index 29d7e344682..3e4b20bb60e 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -304,7 +304,7 @@ PEGEN_OBJS= \ PEGEN_HEADERS= \ - $(srcdir)/Include/pegen_interface.h \ + $(srcdir)/Include/internal/pegen_interface.h \ $(srcdir)/Parser/pegen/pegen.h \ $(srcdir)/Parser/pegen/parse_string.h diff --git a/Modules/_peg_parser.c b/Modules/_peg_parser.c index 0a84edcfc00..cb5f9aa63ae 100644 --- a/Modules/_peg_parser.c +++ b/Modules/_peg_parser.c @@ -1,5 +1,5 @@ #include -#include +#include "pegen_interface.h" PyObject * _Py_parse_file(PyObject *self, PyObject *args, PyObject *kwds) diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index d795c4d5a7d..3484f44e961 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -161,6 +161,7 @@ + @@ -213,7 +214,6 @@ - diff --git a/Parser/pegen/peg_api.c b/Parser/pegen/peg_api.c index 7c6903cdd93..c42aa680c86 100644 --- a/Parser/pegen/peg_api.c +++ b/Parser/pegen/peg_api.c @@ -1,4 +1,4 @@ -#include +#include "pegen_interface.h" #include "../tokenizer.h" #include "pegen.h" diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index 0b70c950d88..a51c8aae8b4 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -8,6 +8,9 @@ static int init_normalization(Parser *p) { + if (p->normalize) { + return 1; + } PyObject *m = PyImport_ImportModuleNoBlock("unicodedata"); if (!m) { @@ -36,7 +39,7 @@ _PyPegen_new_identifier(Parser *p, char *n) if (!PyUnicode_IS_ASCII(id)) { PyObject *id2; - if (!p->normalize && !init_normalization(p)) + if (!init_normalization(p)) { Py_DECREF(id); goto error; @@ -88,6 +91,9 @@ static inline Py_ssize_t byte_offset_to_character_offset(PyObject *line, int col_offset) { const char *str = PyUnicode_AsUTF8(line); + if (!str) { + return 0; + } PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL); if (!text) { return 0; @@ -171,9 +177,10 @@ _PyPegen_get_expr_name(expr_ty e) } } -static void +static int raise_decode_error(Parser *p) { + assert(PyErr_Occurred()); const char *errtype = NULL; if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { errtype = "unicode error"; @@ -197,6 +204,8 @@ raise_decode_error(Parser *p) Py_XDECREF(value); Py_XDECREF(tback); } + + return -1; } static void @@ -207,27 +216,33 @@ raise_tokenizer_init_error(PyObject *filename) || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { return; } - PyObject *type, *value, *tback, *errstr; + PyObject *errstr = NULL; + PyObject *tuple = NULL; + PyObject *type, *value, *tback; PyErr_Fetch(&type, &value, &tback); errstr = PyObject_Str(value); + if (!errstr) { + goto error; + } - Py_INCREF(Py_None); - PyObject *tmp = Py_BuildValue("(OiiN)", filename, 0, -1, Py_None); + PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); if (!tmp) { goto error; } - value = PyTuple_Pack(2, errstr, tmp); + tuple = PyTuple_Pack(2, errstr, tmp); Py_DECREF(tmp); if (!value) { goto error; } - PyErr_SetObject(PyExc_SyntaxError, value); + PyErr_SetObject(PyExc_SyntaxError, tuple); error: Py_XDECREF(type); Py_XDECREF(value); Py_XDECREF(tback); + Py_XDECREF(errstr); + Py_XDECREF(tuple); } static inline PyObject * @@ -337,9 +352,6 @@ tokenizer_error(Parser *p) errtype = PyExc_IndentationError; msg = "too many levels of indentation"; break; - case E_DECODE: - raise_decode_error(p); - return -1; case E_LINECONT: msg = "unexpected character after line continuation character"; break; @@ -513,7 +525,12 @@ _PyPegen_fill_token(Parser *p) const char *start, *end; int type = PyTokenizer_Get(p->tok, &start, &end); if (type == ERRORTOKEN) { - return tokenizer_error(p); + if (p->tok->done == E_DECODE) { + return raise_decode_error(p); + } + else { + return tokenizer_error(p); + } } if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) { type = NEWLINE; /* Add an extra newline */ @@ -530,13 +547,21 @@ _PyPegen_fill_token(Parser *p) if (p->fill == p->size) { int newsize = p->size * 2; - p->tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *)); - if (p->tokens == NULL) { - PyErr_Format(PyExc_MemoryError, "Realloc tokens failed"); + Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *)); + if (new_tokens == NULL) { + PyErr_NoMemory(); return -1; } + else { + p->tokens = new_tokens; + } for (int i = p->size; i < newsize; i++) { p->tokens[i] = PyMem_Malloc(sizeof(Token)); + if (p->tokens[i] == NULL) { + p->size = i; // Needed, in order to cleanup correctly after parser fails + PyErr_NoMemory(); + return -1; + } memset(p->tokens[i], '\0', sizeof(Token)); } p->size = newsize; @@ -566,8 +591,6 @@ _PyPegen_fill_token(Parser *p) t->end_lineno = p->starting_lineno + end_lineno; t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset; - // if (p->fill % 100 == 0) fprintf(stderr, "Filled at %d: %s \"%s\"\n", p->fill, - // token_name(type), PyBytes_AsString(t->bytes)); p->fill += 1; return 0; } @@ -614,6 +637,7 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres) { if (p->mark == p->fill) { if (_PyPegen_fill_token(p) < 0) { + p->error_indicator = 1; return -1; } } @@ -632,11 +656,9 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres) } p->mark = m->mark; *(void **)(pres) = m->node; - // fprintf(stderr, "%d < %d: memoized!\n", p->mark, p->fill); return 1; } } - // fprintf(stderr, "%d < %d: not memoized\n", p->mark, p->fill); return 0; } @@ -683,18 +705,15 @@ _PyPegen_expect_token(Parser *p, int type) { if (p->mark == p->fill) { if (_PyPegen_fill_token(p) < 0) { + p->error_indicator = 1; return NULL; } } Token *t = p->tokens[p->mark]; if (t->type != type) { - // fprintf(stderr, "No %s at %d\n", token_name(type), p->mark); return NULL; } p->mark += 1; - // fprintf(stderr, "Got %s at %d: %s\n", token_name(type), p->mark, - // PyBytes_AsString(t->bytes)); - return t; } @@ -888,8 +907,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena { Parser *p = PyMem_Malloc(sizeof(Parser)); if (p == NULL) { - PyErr_Format(PyExc_MemoryError, "Out of memory for Parser"); - return NULL; + return (Parser *) PyErr_NoMemory(); } assert(tok != NULL); p->tok = tok; @@ -898,10 +916,14 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena p->tokens = PyMem_Malloc(sizeof(Token *)); if (!p->tokens) { PyMem_Free(p); - PyErr_Format(PyExc_MemoryError, "Out of memory for tokens"); - return NULL; + return (Parser *) PyErr_NoMemory(); } p->tokens[0] = PyMem_Malloc(sizeof(Token)); + if (!p->tokens) { + PyMem_Free(p->tokens); + PyMem_Free(p); + return (Parser *) PyErr_NoMemory(); + } memset(p->tokens[0], '\0', sizeof(Token)); p->mark = 0; p->fill = 0; @@ -1187,7 +1209,7 @@ _PyPegen_seq_count_dots(asdl_seq *seq) number_of_dots += 1; break; default: - assert(current_expr->type == ELLIPSIS || current_expr->type == DOT); + Py_UNREACHABLE(); } } diff --git a/Python/pythonrun.c b/Python/pythonrun.c index e3fd3b24271..3a2fe966c08 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -29,7 +29,7 @@ #include "ast.h" // PyAST_FromNodeObject() #include "marshal.h" // PyMarshal_ReadLongFromFile() -#include // PyPegen_ASTFrom* +#include "pegen_interface.h" // PyPegen_ASTFrom* #ifdef MS_WINDOWS # include "malloc.h" // alloca()