From ebebb6429c224c713e1c63a0b05d4840f52c7415 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou <lisandrosnik@gmail.com>
Date: Thu, 23 Apr 2020 18:36:06 +0300
Subject: [PATCH] bpo-40334: Improve various PEG-Parser related stuff
 (GH-19669)

The changes in this commit all address @vstinner's original review comments on the initial PEP 617 implementation PR.
---
 Include/{ => internal}/pegen_interface.h |  6 +-
 Makefile.pre.in                          |  2 +-
 Modules/_peg_parser.c                    |  2 +-
 PCbuild/pythoncore.vcxproj               |  2 +-
 Parser/pegen/peg_api.c                   |  2 +-
 Parser/pegen/pegen.c                     | 76 +++++++++++++++---------
 Python/pythonrun.c                       |  2 +-
 7 files changed, 58 insertions(+), 34 deletions(-)
 rename Include/{ => internal}/pegen_interface.h (94%)
diff --git a/Include/pegen_interface.h b/Include/internal/pegen_interface.h
similarity index 94%
rename from Include/pegen_interface.h
rename to Include/internal/pegen_interface.h
index bf5b29634ac..d8621c1a889 100644
--- a/Include/pegen_interface.h
+++ b/Include/internal/pegen_interface.h
@@ -1,10 +1,13 @@
-#ifndef Py_LIMITED_API
 #ifndef Py_PEGENINTERFACE
 #define Py_PEGENINTERFACE
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
 #include "Python.h"
 #include "Python-ast.h"
 
@@ -29,4 +32,3 @@ PyAPI_FUNC(PyCodeObject *) PyPegen_CodeObjectFromFileObject(FILE *, PyObject *fi
 }
 #endif
 #endif /* !Py_PEGENINTERFACE*/
-#endif /* !Py_LIMITED_API */
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 29d7e344682..3e4b20bb60e 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -304,7 +304,7 @@ PEGEN_OBJS=		\
 
 
 PEGEN_HEADERS= \
-		$(srcdir)/Include/pegen_interface.h \
+		$(srcdir)/Include/internal/pegen_interface.h \
 		$(srcdir)/Parser/pegen/pegen.h \
 		$(srcdir)/Parser/pegen/parse_string.h
 
diff --git a/Modules/_peg_parser.c b/Modules/_peg_parser.c
index 0a84edcfc00..cb5f9aa63ae 100644
--- a/Modules/_peg_parser.c
+++ b/Modules/_peg_parser.c
@@ -1,5 +1,5 @@
 #include <Python.h>
-#include <pegen_interface.h>
+#include "pegen_interface.h"
 
 PyObject *
 _Py_parse_file(PyObject *self, PyObject *args, PyObject *kwds)
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index d795c4d5a7d..3484f44e961 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -161,6 +161,7 @@
     <ClInclude Include="..\Include\graminit.h" />
     <ClInclude Include="..\Include\grammar.h" />
     <ClInclude Include="..\Include\import.h" />
+    <ClInclude Include="..\Include\internal\pegen_interface.h" />
     <ClInclude Include="..\Include\internal\pycore_abstract.h" />
     <ClInclude Include="..\Include\internal\pycore_accu.h" />
     <ClInclude Include="..\Include\internal\pycore_atomic.h" />
@@ -213,7 +214,6 @@
     <ClInclude Include="..\Include\parsetok.h" />
     <ClInclude Include="..\Include\patchlevel.h" />
     <ClInclude Include="..\Include\picklebufobject.h" />
-    <ClInclude Include="..\Include\pegen_interface.h" />
     <ClInclude Include="..\Include\pyhash.h" />
     <ClInclude Include="..\Include\pyhash.h" />
     <ClInclude Include="..\Include\py_curses.h" />
diff --git a/Parser/pegen/peg_api.c b/Parser/pegen/peg_api.c
index 7c6903cdd93..c42aa680c86 100644
--- a/Parser/pegen/peg_api.c
+++ b/Parser/pegen/peg_api.c
@@ -1,4 +1,4 @@
-#include <pegen_interface.h>
+#include "pegen_interface.h"
 
 #include "../tokenizer.h"
 #include "pegen.h"
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index 0b70c950d88..a51c8aae8b4 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -8,6 +8,9 @@
 static int
 init_normalization(Parser *p)
 {
+    if (p->normalize) {
+        return 1;
+    }
     PyObject *m = PyImport_ImportModuleNoBlock("unicodedata");
     if (!m)
     {
@@ -36,7 +39,7 @@ _PyPegen_new_identifier(Parser *p, char *n)
     if (!PyUnicode_IS_ASCII(id))
     {
         PyObject *id2;
-        if (!p->normalize && !init_normalization(p))
+        if (!init_normalization(p))
         {
             Py_DECREF(id);
             goto error;
@@ -88,6 +91,9 @@ static inline Py_ssize_t
 byte_offset_to_character_offset(PyObject *line, int col_offset)
 {
     const char *str = PyUnicode_AsUTF8(line);
+    if (!str) {
+        return 0;
+    }
     PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL);
     if (!text) {
         return 0;
@@ -171,9 +177,10 @@ _PyPegen_get_expr_name(expr_ty e)
     }
 }
 
-static void
+static int
 raise_decode_error(Parser *p)
 {
+    assert(PyErr_Occurred());
     const char *errtype = NULL;
     if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
         errtype = "unicode error";
@@ -197,6 +204,8 @@ raise_decode_error(Parser *p)
         Py_XDECREF(value);
         Py_XDECREF(tback);
     }
+
+    return -1;
 }
 
 static void
@@ -207,27 +216,33 @@ raise_tokenizer_init_error(PyObject *filename)
           || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
         return;
     }
-    PyObject *type, *value, *tback, *errstr;
+    PyObject *errstr = NULL;
+    PyObject *tuple = NULL;
+    PyObject *type, *value, *tback;
     PyErr_Fetch(&type, &value, &tback);
     errstr = PyObject_Str(value);
+    if (!errstr) {
+        goto error;
+    }
 
-    Py_INCREF(Py_None);
-    PyObject *tmp = Py_BuildValue("(OiiN)", filename, 0, -1, Py_None);
+    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
     if (!tmp) {
         goto error;
     }
 
-    value = PyTuple_Pack(2, errstr, tmp);
+    tuple = PyTuple_Pack(2, errstr, tmp);
     Py_DECREF(tmp);
-    if (!value) {
+    if (!tuple) {  // test the tuple just built, not the fetched 'value'
         goto error;
     }
-    PyErr_SetObject(PyExc_SyntaxError, value);
+    PyErr_SetObject(PyExc_SyntaxError, tuple);
 
 error:
     Py_XDECREF(type);
     Py_XDECREF(value);
     Py_XDECREF(tback);
+    Py_XDECREF(errstr);
+    Py_XDECREF(tuple);
 }
 
 static inline PyObject *
@@ -337,9 +352,6 @@ tokenizer_error(Parser *p)
             errtype = PyExc_IndentationError;
             msg = "too many levels of indentation";
             break;
-        case E_DECODE:
-            raise_decode_error(p);
-            return -1;
         case E_LINECONT:
             msg = "unexpected character after line continuation character";
             break;
@@ -513,7 +525,12 @@ _PyPegen_fill_token(Parser *p)
     const char *start, *end;
     int type = PyTokenizer_Get(p->tok, &start, &end);
     if (type == ERRORTOKEN) {
-        return tokenizer_error(p);
+        if (p->tok->done == E_DECODE) {
+            return raise_decode_error(p);
+        }
+        else {
+            return tokenizer_error(p);
+        }
     }
     if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) {
         type = NEWLINE; /* Add an extra newline */
@@ -530,13 +547,21 @@ _PyPegen_fill_token(Parser *p)
 
     if (p->fill == p->size) {
         int newsize = p->size * 2;
-        p->tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
-        if (p->tokens == NULL) {
-            PyErr_Format(PyExc_MemoryError, "Realloc tokens failed");
+        Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
+        if (new_tokens == NULL) {
+            PyErr_NoMemory();
             return -1;
         }
+        else {
+            p->tokens = new_tokens;
+        }
         for (int i = p->size; i < newsize; i++) {
             p->tokens[i] = PyMem_Malloc(sizeof(Token));
+            if (p->tokens[i] == NULL) {
+                p->size = i; // Needed, in order to cleanup correctly after parser fails
+                PyErr_NoMemory();
+                return -1;
+            }
             memset(p->tokens[i], '\0', sizeof(Token));
         }
         p->size = newsize;
@@ -566,8 +591,6 @@ _PyPegen_fill_token(Parser *p)
     t->end_lineno = p->starting_lineno + end_lineno;
     t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset;
 
-    // if (p->fill % 100 == 0) fprintf(stderr, "Filled at %d: %s \"%s\"\n", p->fill,
-    // token_name(type), PyBytes_AsString(t->bytes));
     p->fill += 1;
     return 0;
 }
@@ -614,6 +637,7 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres)
 {
     if (p->mark == p->fill) {
         if (_PyPegen_fill_token(p) < 0) {
+            p->error_indicator = 1;
             return -1;
         }
     }
@@ -632,11 +656,9 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres)
             }
             p->mark = m->mark;
             *(void **)(pres) = m->node;
-            // fprintf(stderr, "%d < %d: memoized!\n", p->mark, p->fill);
             return 1;
         }
     }
-    // fprintf(stderr, "%d < %d: not memoized\n", p->mark, p->fill);
     return 0;
 }
 
@@ -683,18 +705,15 @@ _PyPegen_expect_token(Parser *p, int type)
 {
     if (p->mark == p->fill) {
         if (_PyPegen_fill_token(p) < 0) {
+            p->error_indicator = 1;
             return NULL;
         }
     }
     Token *t = p->tokens[p->mark];
     if (t->type != type) {
-        // fprintf(stderr, "No %s at %d\n", token_name(type), p->mark);
         return NULL;
     }
     p->mark += 1;
-    // fprintf(stderr, "Got %s at %d: %s\n", token_name(type), p->mark,
-    // PyBytes_AsString(t->bytes));
-
     return t;
 }
 
@@ -888,8 +907,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena
 {
     Parser *p = PyMem_Malloc(sizeof(Parser));
     if (p == NULL) {
-        PyErr_Format(PyExc_MemoryError, "Out of memory for Parser");
-        return NULL;
+        return (Parser *) PyErr_NoMemory();
     }
     assert(tok != NULL);
     p->tok = tok;
@@ -898,10 +916,14 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int *errcode, PyArena
     p->tokens = PyMem_Malloc(sizeof(Token *));
     if (!p->tokens) {
         PyMem_Free(p);
-        PyErr_Format(PyExc_MemoryError, "Out of memory for tokens");
-        return NULL;
+        return (Parser *) PyErr_NoMemory();
     }
     p->tokens[0] = PyMem_Malloc(sizeof(Token));
+    if (!p->tokens[0]) {  // test the Token allocation itself, not the array
+        PyMem_Free(p->tokens);
+        PyMem_Free(p);
+        return (Parser *) PyErr_NoMemory();
+    }
     memset(p->tokens[0], '\0', sizeof(Token));
     p->mark = 0;
     p->fill = 0;
@@ -1187,7 +1209,7 @@ _PyPegen_seq_count_dots(asdl_seq *seq)
                 number_of_dots += 1;
                 break;
             default:
-                assert(current_expr->type == ELLIPSIS || current_expr->type == DOT);
+                Py_UNREACHABLE();
         }
     }
 
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index e3fd3b24271..3a2fe966c08 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -29,7 +29,7 @@
 #include "ast.h"                  // PyAST_FromNodeObject()
 #include "marshal.h"              // PyMarshal_ReadLongFromFile()
 
-#include <pegen_interface.h>      // PyPegen_ASTFrom*
+#include "pegen_interface.h"      // PyPegen_ASTFrom*
 
 #ifdef MS_WINDOWS
 #  include "malloc.h"             // alloca()