#include <Python.h>

#include "pycore_ast.h"           // _PyAST_Validate()
#include "pycore_pystate.h"       // _PyThreadState_GET()
#include "pycore_pyerrors.h"      // PyExc_IncompleteInputError

#include <errcode.h>

#include "lexer/lexer.h"
#include "tokenizer/tokenizer.h"
#include "pegen.h"

// Internal parser functions

asdl_stmt_seq*
_PyPegen_interactive_exit(Parser *p)
{
    if (p->errcode) {
        *(p->errcode) = E_EOF;
    }
    return NULL;
}
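
// Count how many characters (code points) lie between the byte offsets
// col_offset and end_col_offset of a source line by stepping over UTF-8
// lead bytes. Returns -1 on an invalid UTF-8 sequence.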
Py_ssize_t
_PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
{
    const unsigned char *data = (const unsigned char*)PyUnicode_AsUTF8(line);

    Py_ssize_t len = 0;
    while (col_offset < end_col_offset) {
        Py_UCS4 ch = data[col_offset];
        if (ch < 0x80) {
            col_offset += 1;
        } else if ((ch & 0xe0) == 0xc0) {
            col_offset += 2;
        } else if ((ch & 0xf0) == 0xe0) {
            col_offset += 3;
        } else if ((ch & 0xf8) == 0xf0) {
            col_offset += 4;
        } else {
            PyErr_SetString(PyExc_ValueError, "Invalid UTF-8 sequence");
            return -1;
        }
        len++;
    }
    return len;
}
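
// Convert a byte column offset within the raw C string `str` into a
// character offset, by decoding the prefix as UTF-8 (with the "replace"
// error handler) and measuring its length. Returns -1 if decoding fails.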
Py_ssize_t
_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
{
    Py_ssize_t len = (Py_ssize_t)strlen(str);
    if (col_offset > len + 1) {
        col_offset = len + 1;
    }
    assert(col_offset >= 0);
    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
    if (!text) {
        return -1;
    }
    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
    Py_DECREF(text);
    return size;
}

Py_ssize_t
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
{
    const char *str = PyUnicode_AsUTF8(line);
    if (!str) {
        return -1;
    }
    return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
}

// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
int
_PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
{
    // Insert in front
    Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
    if (m == NULL) {
        return -1;
    }
    m->type = type;
    m->node = node;
    m->mark = p->mark;
    m->next = p->tokens[mark]->memo;
    p->tokens[mark]->memo = m;
    return 0;
}

// Like _PyPegen_insert_memo(), but updates an existing node if found.
int
_PyPegen_update_memo(Parser *p, int mark, int type, void *node)
{
    for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
        if (m->type == type) {
            // Update existing node.
            m->node = node;
            m->mark = p->mark;
            return 0;
        }
    }
    // Insert new node.
    return _PyPegen_insert_memo(p, mark, type, node);
}
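
// Lazily import unicodedata.normalize() and cache it on the parser.
// Returns 1 on success (or if it is already cached) and 0 on failure.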
static int
init_normalization(Parser *p)
{
    if (p->normalize) {
        return 1;
    }
    p->normalize = _PyImport_GetModuleAttrString("unicodedata", "normalize");
    if (!p->normalize)
    {
        return 0;
    }
    return 1;
}
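
// Helpers for the growable array that collects '# type: ignore' comments
// while tokenizing (see _PyPegen_fill_token below).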
static int
growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
    assert(initial_size > 0);
    arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
    arr->size = initial_size;
    arr->num_items = 0;

    return arr->items != NULL;
}

static int
growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
    if (arr->num_items >= arr->size) {
        size_t new_size = arr->size * 2;
        void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
        if (!new_items_array) {
            return 0;
        }
        arr->items = new_items_array;
        arr->size = new_size;
    }

    arr->items[arr->num_items].lineno = lineno;
    arr->items[arr->num_items].comment = comment;  // Take ownership
    arr->num_items++;
    return 1;
}

static void
growable_comment_array_deallocate(growable_comment_array *arr) {
    for (unsigned i = 0; i < arr->num_items; i++) {
        PyMem_Free(arr->items[i].comment);
    }
    PyMem_Free(arr->items);
}
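
// Decide whether a NAME-like token is really a keyword: look it up in the
// keyword table indexed by length and return its token type, or NAME if it
// is not a keyword.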
static int
_get_keyword_or_name_type(Parser *p, struct token *new_token)
{
    Py_ssize_t name_len = new_token->end_col_offset - new_token->col_offset;
    assert(name_len > 0);

    if (name_len >= p->n_keyword_lists ||
        p->keywords[name_len] == NULL ||
        p->keywords[name_len]->type == -1) {
        return NAME;
    }
    for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
        if (strncmp(k->str, new_token->start, (size_t)name_len) == 0) {
            return k->type;
        }
    }
    return NAME;
}
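
// Copy one token from the tokenizer's `struct token` into the parser's
// Token: resolve keywords, store the bytes and metadata in the arena, fix
// up locations for interactive input, and report tokenizer errors.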
static int
initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
    assert(parser_token != NULL);

    parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
    parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
    if (parser_token->bytes == NULL) {
        return -1;
    }
    if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
        Py_DECREF(parser_token->bytes);
        return -1;
    }

    parser_token->metadata = NULL;
    if (new_token->metadata != NULL) {
        if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) {
            Py_DECREF(parser_token->metadata);
            return -1;
        }
        parser_token->metadata = new_token->metadata;
        new_token->metadata = NULL;
    }

    parser_token->level = new_token->level;
    parser_token->lineno = new_token->lineno;
    parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
                                                                    : new_token->col_offset;
    parser_token->end_lineno = new_token->end_lineno;
    parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
                                                                        : new_token->end_col_offset;

    p->fill += 1;

    if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
        return _Pypegen_raise_decode_error(p);
    }

    return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
}
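
// Double the capacity of p->tokens and zero-initialize the new slots.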
static int
_resize_tokens_array(Parser *p) {
    int newsize = p->size * 2;
    Token **new_tokens = PyMem_Realloc(p->tokens, (size_t)newsize * sizeof(Token *));
    if (new_tokens == NULL) {
        PyErr_NoMemory();
        return -1;
    }
    p->tokens = new_tokens;

    for (int i = p->size; i < newsize; i++) {
        p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
        if (p->tokens[i] == NULL) {
            p->size = i;  // Needed, in order to cleanup correctly after parser fails
            PyErr_NoMemory();
            return -1;
        }
    }
    p->size = newsize;
    return 0;
}
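
// Fetch the next token from the tokenizer into p->tokens[p->fill].
// '# type: ignore' comments are recorded and skipped, and in single-input
// mode an extra NEWLINE is synthesized at the end of the input.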
int
_PyPegen_fill_token(Parser *p)
{
    struct token new_token;
    _PyToken_Init(&new_token);
    int type = _PyTokenizer_Get(p->tok, &new_token);

    // Record and skip '# type: ignore' comments
    while (type == TYPE_IGNORE) {
        Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
        char *tag = PyMem_Malloc((size_t)len + 1);
        if (tag == NULL) {
            PyErr_NoMemory();
            goto error;
        }
        strncpy(tag, new_token.start, (size_t)len);
        tag[len] = '\0';
        // Ownership of tag passes to the growable array
        if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
            PyErr_NoMemory();
            goto error;
        }
        type = _PyTokenizer_Get(p->tok, &new_token);
    }

    // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
    if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
        type = NEWLINE; /* Add an extra newline */
        p->parsing_started = 0;

        if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
            p->tok->pendin = -p->tok->indent;
            p->tok->indent = 0;
        }
    }
    else {
        p->parsing_started = 1;
    }

    // Check if we are at the limit of the token array capacity and resize if needed
    if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
        goto error;
    }

    Token *t = p->tokens[p->fill];
    return initialize_token(p, t, &new_token, type);
error:
    _PyToken_Free(&new_token);
    return -1;
}

#if defined(Py_DEBUG)
// Instrumentation to count the effectiveness of memoization.
// The array counts the number of tokens skipped by memoization,
// indexed by type.

#define NSTATISTICS _PYPEGEN_NSTATISTICS
#define memo_statistics _PyRuntime.parser.memo_statistics

#ifdef Py_GIL_DISABLED
#define MUTEX_LOCK() PyMutex_Lock(&_PyRuntime.parser.mutex)
#define MUTEX_UNLOCK() PyMutex_Unlock(&_PyRuntime.parser.mutex)
#else
#define MUTEX_LOCK()
#define MUTEX_UNLOCK()
#endif

void
_PyPegen_clear_memo_statistics(void)
{
    MUTEX_LOCK();
    for (int i = 0; i < NSTATISTICS; i++) {
        memo_statistics[i] = 0;
    }
    MUTEX_UNLOCK();
}

PyObject *
_PyPegen_get_memo_statistics(void)
{
    PyObject *ret = PyList_New(NSTATISTICS);
    if (ret == NULL) {
        return NULL;
    }

    MUTEX_LOCK();
    for (int i = 0; i < NSTATISTICS; i++) {
        PyObject *value = PyLong_FromLong(memo_statistics[i]);
        if (value == NULL) {
            MUTEX_UNLOCK();
            Py_DECREF(ret);
            return NULL;
        }
        // PyList_SetItem borrows a reference to value.
        if (PyList_SetItem(ret, i, value) < 0) {
            MUTEX_UNLOCK();
            Py_DECREF(ret);
            return NULL;
        }
    }
    MUTEX_UNLOCK();
    return ret;
}
#endif
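
// Look up a memoized result for rule `type` at the current position. On a
// hit, advance p->mark to the saved position, store the node in *pres and
// return 1; return 0 on a miss and -1 on error.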
int // bool
_PyPegen_is_memoized(Parser *p, int type, void *pres)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return -1;
        }
    }

    Token *t = p->tokens[p->mark];

    for (Memo *m = t->memo; m != NULL; m = m->next) {
        if (m->type == type) {
#if defined(Py_DEBUG)
            if (0 <= type && type < NSTATISTICS) {
                long count = m->mark - p->mark;
                // A memoized negative result counts for one.
                if (count <= 0) {
                    count = 1;
                }
                MUTEX_LOCK();
                memo_statistics[type] += count;
                MUTEX_UNLOCK();
            }
#endif
            p->mark = m->mark;
            *(void **)(pres) = m->node;
            return 1;
        }
    }
    return 0;
}
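
// The _PyPegen_lookahead* helpers run a rule function and then restore
// p->mark, so that only success or failure is observed. They return 1 when
// the outcome matches `positive` (1 for a positive lookahead, 0 for a
// negative one).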
int
_PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
{
    int mark = p->mark;
    void *res = func(p);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
{
    int mark = p->mark;
    void *res = func(p, arg);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
{
    int mark = p->mark;
    void *res = func(p, arg);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
{
    int mark = p->mark;
    void *res = (void*)func(p);
    p->mark = mark;
    return (res != NULL) == positive;
}
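
// Return (and consume) the next token if it has the expected type,
// otherwise return NULL without advancing.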
Token *
_PyPegen_expect_token(Parser *p, int type)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        return NULL;
    }
    p->mark += 1;
    return t;
}

void*
_PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {

    if (p->error_indicator == 1) {
        return NULL;
    }
    if (result == NULL) {
        RAISE_SYNTAX_ERROR("expected (%s)", expected);
        return NULL;
    }
    return result;
}
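
// Like _PyPegen_expect_token(), but raise a SyntaxError at the offending
// token when it does not have the expected type (used for the grammar's
// forced tokens).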
Token *
_PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {

    if (p->error_indicator == 1) {
        return NULL;
    }

    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
        return NULL;
    }
    p->mark += 1;
    return t;
}
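
// Match a NAME token whose text equals `keyword`. On a match, consume it as
// a name expression; otherwise return NULL without advancing.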
expr_ty
_PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != NAME) {
        return NULL;
    }
    const char *s = PyBytes_AsString(t->bytes);
    if (!s) {
        p->error_indicator = 1;
        return NULL;
    }
    if (strcmp(s, keyword) != 0) {
        return NULL;
    }
    return _PyPegen_name_token(p);
}

Token *
_PyPegen_get_last_nonnwhitespace_token(Parser *p)
{
    assert(p->mark >= 0);
    Token *token = NULL;
    for (int m = p->mark - 1; m >= 0; m--) {
        token = p->tokens[m];
        if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
            break;
        }
    }
    return token;
}
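
// Decode an identifier from UTF-8, normalize non-ASCII names to NFKC,
// intern the result and store it in the arena.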
PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    PyObject *id = PyUnicode_DecodeUTF8(n, (Py_ssize_t)strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
    if (!PyUnicode_IS_ASCII(id))
    {
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *args[2] = {form, id};
        PyObject *id2 = PyObject_Vectorcall(p->normalize, args, 2, NULL);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }

        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    PyInterpreterState *interp = _PyInterpreterState_GET();
    _PyUnicode_InternImmortal(interp, &id);
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;

error:
    p->error_indicator = 1;
    return NULL;
}

static expr_ty
_PyPegen_name_from_token(Parser *p, Token* t)
{
    if (t == NULL) {
        return NULL;
    }
    const char *s = PyBytes_AsString(t->bytes);
    if (!s) {
        p->error_indicator = 1;
        return NULL;
    }
    PyObject *id = _PyPegen_new_identifier(p, s);
    if (id == NULL) {
        p->error_indicator = 1;
        return NULL;
    }
    return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
                       t->end_col_offset, p->arena);
}

expr_ty
_PyPegen_name_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NAME);
    return _PyPegen_name_from_token(p, t);
}

void *
_PyPegen_string_token(Parser *p)
{
    return _PyPegen_expect_token(p, STRING);
}
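
// Match any soft keyword: if the next token is a NAME whose text equals one
// of p->soft_keywords, return it as a name expression; otherwise return
// NULL (the caller rewinds the mark on failure).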
expr_ty _PyPegen_soft_keyword_token(Parser *p) {
    Token *t = _PyPegen_expect_token(p, NAME);
    if (t == NULL) {
        return NULL;
    }
    char *the_token;
    Py_ssize_t size;
    PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
    for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
        if (strncmp(*keyword, the_token, (size_t)size) == 0) {
            return _PyPegen_name_from_token(p, t);
        }
    }
    return NULL;
}
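
// Convert the text of a NUMBER token (already stripped of underscores) into
// an int, float or complex object.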
static PyObject *
parsenumber_raw(const char *s)
{
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;

    assert(s != NULL);
    errno = 0;
    end = s + strlen(s) - 1;
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    if (*end == '\0') {
        if (errno != 0) {
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}
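
// Strip underscores from a numeric literal (PEP 515) before handing it to
// parsenumber_raw().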
static PyObject *
parsenumber(const char *s)
{
    char *dup;
    char *end;
    PyObject *res = NULL;

    assert(s != NULL);

    if (strchr(s, '_') == NULL) {
        return parsenumber_raw(s);
    }
    /* Create a duplicate without underscores. */
    dup = PyMem_Malloc(strlen(s) + 1);
    if (dup == NULL) {
        return PyErr_NoMemory();
    }
    end = dup;
    for (; *s; s++) {
        if (*s != '_') {
            *end++ = *s;
        }
    }
    *end = '\0';
    res = parsenumber_raw(dup);
    PyMem_Free(dup);
    return res;
}

expr_ty
_PyPegen_number_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }

    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }

    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }

    PyObject *c = parsenumber(num_raw);

    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->current_exception != NULL &&
            Py_TYPE(tstate->current_exception) == (PyTypeObject *)PyExc_ValueError
        ) {
            PyObject *exc = PyErr_GetRaisedException();
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                exc);
            Py_DECREF(exc);
        }
        return NULL;
    }

    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }

    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}

/* Check that the source for a single input statement really is a single
   statement by looking at what is left in the buffer after parsing.
   Trailing whitespace and comments are OK. */
static int // bool
bad_single_statement(Parser *p)
{
    char *cur = p->tok->cur;
    char c = *cur;

    for (;;) {
        while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
            c = *++cur;
        }

        if (!c) {
            return 0;
        }

        if (c != '#') {
            return 1;
        }

        /* Suck up comment. */
        while (c && c != '\n') {
            c = *++cur;
        }
    }
}
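
// Translate the PyCompilerFlags cf_flags bits into the parser's PyPARSE_*
// flags.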
static int
compute_parser_flags(PyCompilerFlags *flags)
{
    int parser_flags = 0;
    if (!flags) {
        return 0;
    }
    if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
        parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
    }
    if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
        parser_flags |= PyPARSE_IGNORE_COOKIE;
    }
    if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
        parser_flags |= PyPARSE_BARRY_AS_BDFL;
    }
    if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
        parser_flags |= PyPARSE_TYPE_COMMENTS;
    }
    if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
        parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
    }
    return parser_flags;
}

// Parser API

Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
{
    Parser *p = PyMem_Malloc(sizeof(Parser));
    if (p == NULL) {
        return (Parser *) PyErr_NoMemory();
    }
    assert(tok != NULL);
    tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
    p->tok = tok;
    p->keywords = NULL;
    p->n_keyword_lists = -1;
    p->soft_keywords = NULL;
    p->tokens = PyMem_Malloc(sizeof(Token *));
    if (!p->tokens) {
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
    if (!p->tokens[0]) {
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
        PyMem_Free(p->tokens[0]);
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }

    p->mark = 0;
    p->fill = 0;
    p->size = 1;

    p->errcode = errcode;
    p->arena = arena;
    p->start_rule = start_rule;
    p->parsing_started = 0;
    p->normalize = NULL;
    p->error_indicator = 0;

    p->starting_lineno = 0;
    p->starting_col_offset = 0;
    p->flags = flags;
    p->feature_version = feature_version;
    p->known_err_token = NULL;
    p->level = 0;
    p->call_invalid_rules = 0;
#ifdef Py_DEBUG
    p->debug = _Py_GetConfig()->parser_debug;
#endif
    return p;
}

void
_PyPegen_Parser_Free(Parser *p)
{
    Py_XDECREF(p->normalize);
    for (int i = 0; i < p->size; i++) {
        PyMem_Free(p->tokens[i]);
    }
    PyMem_Free(p->tokens);
    growable_comment_array_deallocate(&p->type_ignore_comments);
    PyMem_Free(p);
}
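
// Clear memoization results and rewind the parser so the second
// (error-reporting) pass can re-run from the first token with the
// invalid_* rules enabled.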
static void
reset_parser_state_for_error_pass(Parser *p)
{
    for (int i = 0; i < p->fill; i++) {
        p->tokens[i]->memo = NULL;
    }
    p->mark = 0;
    p->call_invalid_rules = 1;
    // Don't try to get extra tokens in interactive mode when trying to
    // raise specialized errors in the second pass.
    p->tok->interactive_underflow = IUNDERFLOW_STOP;
}

static inline int
_is_end_of_source(Parser *p) {
    int err = p->tok->done;
    return err == E_EOF || err == E_EOFS || err == E_EOLS;
}
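
// Run the parser. On failure, report incomplete input when it is allowed,
// propagate non-SyntaxError exceptions, or re-parse in the error pass to
// produce the best possible SyntaxError.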
void *
_PyPegen_run_parser(Parser *p)
{
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) {
            PyErr_Clear();
            return _PyPegen_raise_error(p, PyExc_IncompleteInputError, 0, "incomplete input");
        }
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
        // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
        return NULL;
    }

    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}
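
// Parse source read from a FILE*. For interactive input, the original
// source text can be returned through *interactive_src.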
mod_ty
_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
                                      const char *enc, const char *ps1, const char *ps2,
                                      PyCompilerFlags *flags, int *errcode,
                                      PyObject **interactive_src, PyArena *arena)
{
    struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
            return NULL;
        }
        return NULL;
    }
    if (!tok->fp || ps1 != NULL || ps2 != NULL ||
        PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
        tok->fp_interactive = 1;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);

    // From here on we need to clean up even if there's an error
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
                                    errcode, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

    if (tok->fp_interactive && tok->interactive_src_start && result && interactive_src != NULL) {
        *interactive_src = PyUnicode_FromString(tok->interactive_src_start);
        if (!interactive_src || _PyArena_AddPyObject(arena, *interactive_src) < 0) {
            Py_XDECREF(interactive_src);
            result = NULL;
            goto error;
        }
    }

error:
    _PyTokenizer_Free(tok);
    return result;
}
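
// Parse source held in a C string.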
mod_ty
_PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
                                PyCompilerFlags *flags, PyArena *arena)
{
    int exec_input = start_rule == Py_file_input;

    struct tok_state *tok;
    if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
    } else {
        tok = _PyTokenizer_FromString(str, exec_input, 0);
    }
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
        }
        return NULL;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);

    // We need to clear up from here on
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
        flags->cf_feature_version : PY_MINOR_VERSION;
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
                                    NULL, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

error:
    _PyTokenizer_Free(tok);
    return result;
}