From 51c5896b6205911d29ac07f167ec7f3cf1cb600d Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Tue, 16 Jun 2020 16:49:43 +0100 Subject: [PATCH] bpo-40958: Avoid buffer overflow in the parser when indexing the current line (GH-20875) --- .../2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst | 2 ++ Parser/pegen.c | 20 +++++++++---------- Parser/pegen.h | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst b/Misc/NEWS.d/next/Core and Builtins/2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst new file mode 100644 index 00000000000..8e36897948f --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst @@ -0,0 +1,2 @@ +Fix a possible buffer overflow in the PEG parser when gathering information +for emitting syntax errors. Patch by Pablo Galindo. diff --git a/Parser/pegen.c b/Parser/pegen.c index 4cff7342edb..e153e924e93 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -140,21 +140,18 @@ _create_dummy_identifier(Parser *p) } static inline Py_ssize_t -byte_offset_to_character_offset(PyObject *line, int col_offset) +byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset) { const char *str = PyUnicode_AsUTF8(line); if (!str) { return 0; } + assert(col_offset >= 0 && (unsigned long)col_offset <= strlen(str)); PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace"); if (!text) { return 0; } Py_ssize_t size = PyUnicode_GET_LENGTH(text); - str = PyUnicode_AsUTF8(text); - if (str != NULL && (int)strlen(str) == col_offset) { - size = strlen(str); - } Py_DECREF(text); return size; } @@ -366,7 +363,7 @@ void * _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) { Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; - int col_offset; + Py_ssize_t col_offset; if (t->col_offset == -1) { col_offset = Py_SAFE_DOWNCAST(p->tok->cur - p->tok->buf, intptr_t, int); @@ -386,7 +383,7 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) void * _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, - int lineno, int col_offset, + Py_ssize_t lineno, Py_ssize_t col_offset, const char *errmsg, va_list va) { PyObject *value = NULL; @@ -406,16 +403,17 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, if (!error_line) { Py_ssize_t size = p->tok->inp - p->tok->buf; - if (size && p->tok->buf[size-1] == '\n') { - size--; - } error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); if (!error_line) { goto error; } } - Py_ssize_t col_number = byte_offset_to_character_offset(error_line, col_offset); + Py_ssize_t col_number = col_offset; + + if (p->tok->encoding != NULL) { + col_number = byte_offset_to_character_offset(error_line, col_offset); + } tmp = Py_BuildValue("(OiiN)", p->tok->filename, lineno, col_number, error_line); if (!tmp) { diff --git a/Parser/pegen.h b/Parser/pegen.h index 64cf0ec8929..c4ff8c9d512 100644 --- a/Parser/pegen.h +++ b/Parser/pegen.h @@ -34,7 +34,7 @@ typedef struct _memo { typedef struct { int type; PyObject *bytes; - int lineno, col_offset, end_lineno, end_col_offset; + Py_ssize_t lineno, col_offset, end_lineno, end_col_offset; Memo *memo; } Token; @@ -132,7 +132,7 @@ void *_PyPegen_string_token(Parser *p); const char *_PyPegen_get_expr_name(expr_ty); void *_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...); void *_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, - int lineno, int col_offset, + Py_ssize_t lineno, Py_ssize_t col_offset, const char *errmsg, va_list va); void *_PyPegen_dummy_name(Parser *p, ...);