#include #include #include "tokenizer.h" #include "pegen.h" // TOKENIZER ERRORS void _PyPegen_raise_tokenizer_init_error(PyObject *filename) { if (!(PyErr_ExceptionMatches(PyExc_LookupError) || PyErr_ExceptionMatches(PyExc_SyntaxError) || PyErr_ExceptionMatches(PyExc_ValueError) || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) { return; } PyObject *errstr = NULL; PyObject *tuple = NULL; PyObject *type; PyObject *value; PyObject *tback; PyErr_Fetch(&type, &value, &tback); errstr = PyObject_Str(value); if (!errstr) { goto error; } PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None); if (!tmp) { goto error; } tuple = PyTuple_Pack(2, errstr, tmp); Py_DECREF(tmp); if (!value) { goto error; } PyErr_SetObject(PyExc_SyntaxError, tuple); error: Py_XDECREF(type); Py_XDECREF(value); Py_XDECREF(tback); Py_XDECREF(errstr); Py_XDECREF(tuple); } static inline void raise_unclosed_parentheses_error(Parser *p) { int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; int error_col = p->tok->parencolstack[p->tok->level-1]; RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, error_lineno, error_col, error_lineno, -1, "'%c' was never closed", p->tok->parenstack[p->tok->level-1]); } int _Pypegen_tokenizer_error(Parser *p) { if (PyErr_Occurred()) { return -1; } const char *msg = NULL; PyObject* errtype = PyExc_SyntaxError; Py_ssize_t col_offset = -1; switch (p->tok->done) { case E_TOKEN: msg = "invalid token"; break; case E_EOF: if (p->tok->level) { raise_unclosed_parentheses_error(p); } else { RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); } return -1; case E_DEDENT: RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); return -1; case E_INTR: if (!PyErr_Occurred()) { PyErr_SetNone(PyExc_KeyboardInterrupt); } return -1; case E_NOMEM: PyErr_NoMemory(); return -1; case E_TABSPACE: errtype = PyExc_TabError; msg = "inconsistent use of tabs and spaces in indentation"; break; case E_TOODEEP: errtype = PyExc_IndentationError; msg = "too many levels of indentation"; break; case E_LINECONT: { col_offset = p->tok->cur - p->tok->buf - 1; msg = "unexpected character after line continuation character"; break; } default: msg = "unknown parsing error"; } RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno, col_offset >= 0 ? col_offset : 0, p->tok->lineno, -1, msg); return -1; } int _Pypegen_raise_decode_error(Parser *p) { assert(PyErr_Occurred()); const char *errtype = NULL; if (PyErr_ExceptionMatches(PyExc_UnicodeError)) { errtype = "unicode error"; } else if (PyErr_ExceptionMatches(PyExc_ValueError)) { errtype = "value error"; } if (errtype) { PyObject *type; PyObject *value; PyObject *tback; PyObject *errstr; PyErr_Fetch(&type, &value, &tback); errstr = PyObject_Str(value); if (errstr) { RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr); Py_DECREF(errstr); } else { PyErr_Clear(); RAISE_SYNTAX_ERROR("(%s) unknown error", errtype); } Py_XDECREF(type); Py_XDECREF(value); Py_XDECREF(tback); } return -1; } static int _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { // Tokenize the whole input to see if there are any tokenization // errors such as mistmatching parentheses. These will get priority // over generic syntax errors only if the line number of the error is // before the one that we had for the generic error. // We don't want to tokenize to the end for interactive input if (p->tok->prompt != NULL) { return 0; } PyObject *type, *value, *traceback; PyErr_Fetch(&type, &value, &traceback); Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; Py_ssize_t current_err_line = current_token->lineno; int ret = 0; for (;;) { const char *start; const char *end; switch (_PyTokenizer_Get(p->tok, &start, &end)) { case ERRORTOKEN: if (p->tok->level != 0) { int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; if (current_err_line > error_lineno) { raise_unclosed_parentheses_error(p); ret = -1; goto exit; } } break; case ENDMARKER: break; default: continue; } break; } exit: if (PyErr_Occurred()) { Py_XDECREF(value); Py_XDECREF(type); Py_XDECREF(traceback); } else { PyErr_Restore(type, value, traceback); } return ret; } // PARSER ERRORS void * _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) { if (p->fill == 0) { va_list va; va_start(va, errmsg); _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va); va_end(va); return NULL; } Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; Py_ssize_t col_offset; Py_ssize_t end_col_offset = -1; if (t->col_offset == -1) { if (p->tok->cur == p->tok->buf) { col_offset = 0; } else { const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf; col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int); } } else { col_offset = t->col_offset + 1; } if (t->end_col_offset != -1) { end_col_offset = t->end_col_offset + 1; } va_list va; va_start(va, errmsg); _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va); va_end(va); return NULL; } static PyObject * get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno) { /* If the file descriptor is interactive, the source lines of the current * (multi-line) statement are stored in p->tok->interactive_src_start. * If not, we're parsing from a string, which means that the whole source * is stored in p->tok->str. */ assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp != NULL); char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; if (cur_line == NULL) { assert(p->tok->fp_interactive); // We can reach this point if the tokenizer buffers for interactive source have not been // initialized because we failed to decode the original source with the given locale. return PyUnicode_FromStringAndSize("", 0); } Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno; const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp; for (int i = 0; i < relative_lineno - 1; i++) { char *new_line = strchr(cur_line, '\n'); // The assert is here for debug builds but the conditional that // follows is there so in release builds we do not crash at the cost // to report a potentially wrong line. assert(new_line != NULL && new_line + 1 < buf_end); if (new_line == NULL || new_line + 1 > buf_end) { break; } cur_line = new_line + 1; } char *next_newline; if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line next_newline = cur_line + strlen(cur_line); } return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); } void * _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, Py_ssize_t lineno, Py_ssize_t col_offset, Py_ssize_t end_lineno, Py_ssize_t end_col_offset, const char *errmsg, va_list va) { PyObject *value = NULL; PyObject *errstr = NULL; PyObject *error_line = NULL; PyObject *tmp = NULL; p->error_indicator = 1; if (end_lineno == CURRENT_POS) { end_lineno = p->tok->lineno; } if (end_col_offset == CURRENT_POS) { end_col_offset = p->tok->cur - p->tok->line_start; } if (p->start_rule == Py_fstring_input) { const char *fstring_msg = "f-string: "; Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg); char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character if (!new_errmsg) { return (void *) PyErr_NoMemory(); } // Copy both strings into new buffer memcpy(new_errmsg, fstring_msg, strlen(fstring_msg)); memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg)); new_errmsg[len] = 0; errmsg = new_errmsg; } errstr = PyUnicode_FromFormatV(errmsg, va); if (!errstr) { goto error; } if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) { error_line = get_error_line_from_tokenizer_buffers(p, lineno); } else if (p->start_rule == Py_file_input) { error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename, (int) lineno, p->tok->encoding); } if (!error_line) { /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called, then we need to find the error line from some other source, because p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly failed or we're parsing from a string or the REPL. There's a third edge case where we're actually parsing from a file, which has an E_EOF SyntaxError and in that case `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which does not physically exist */ assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { Py_ssize_t size = p->tok->inp - p->tok->buf; error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); } else if (p->tok->fp == NULL || p->tok->fp == stdin) { error_line = get_error_line_from_tokenizer_buffers(p, lineno); } else { error_line = PyUnicode_FromStringAndSize("", 0); } if (!error_line) { goto error; } } if (p->start_rule == Py_fstring_input) { col_offset -= p->starting_col_offset; end_col_offset -= p->starting_col_offset; } Py_ssize_t col_number = col_offset; Py_ssize_t end_col_number = end_col_offset; if (p->tok->encoding != NULL) { col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset); if (col_number < 0) { goto error; } if (end_col_number > 0) { Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number); if (end_col_offset < 0) { goto error; } else { end_col_number = end_col_offset; } } } tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number); if (!tmp) { goto error; } value = PyTuple_Pack(2, errstr, tmp); Py_DECREF(tmp); if (!value) { goto error; } PyErr_SetObject(errtype, value); Py_DECREF(errstr); Py_DECREF(value); if (p->start_rule == Py_fstring_input) { PyMem_Free((void *)errmsg); } return NULL; error: Py_XDECREF(errstr); Py_XDECREF(error_line); if (p->start_rule == Py_fstring_input) { PyMem_Free((void *)errmsg); } return NULL; } void _Pypegen_set_syntax_error(Parser* p, Token* last_token) { // Existing sintax error if (PyErr_Occurred()) { // Prioritize tokenizer errors to custom syntax errors raised // on the second phase only if the errors come from the parser. int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK); if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) { _PyPegen_tokenize_full_source_to_check_for_errors(p); } // Propagate the existing syntax error. return; } // Initialization error if (p->fill == 0) { RAISE_SYNTAX_ERROR("error at start before reading any input"); } // Parser encountered EOF (End of File) unexpectedtly if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) { if (p->tok->level) { raise_unclosed_parentheses_error(p); } else { RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); } return; } // Indentation error in the tokenizer if (last_token->type == INDENT || last_token->type == DEDENT) { RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent"); return; } // Unknown error (generic case) // Use the last token we found on the first pass to avoid reporting // incorrect locations for generic syntax errors just because we reached // further away when trying to find specific syntax errors in the second // pass. RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax"); // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing // generic SyntaxError we just raised if errors are found. _PyPegen_tokenize_full_source_to_check_for_errors(p); }