gh-107450: Check for overflow in the tokenizer and fix overflow test (#110832)

Co-authored-by: Filipe Laíns <lains@riseup.net>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Lysandros Nikolaou 2023-10-16 16:42:49 +02:00 committed by GitHub
parent b3c9faf056
commit a1ac5590e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 40 additions and 22 deletions

View File

@ -19,24 +19,25 @@
extern "C" { extern "C" {
#endif #endif
#define E_OK 10 /* No error */ #define E_OK 10 /* No error */
#define E_EOF 11 /* End Of File */ #define E_EOF 11 /* End Of File */
#define E_INTR 12 /* Interrupted */ #define E_INTR 12 /* Interrupted */
#define E_TOKEN 13 /* Bad token */ #define E_TOKEN 13 /* Bad token */
#define E_SYNTAX 14 /* Syntax error */ #define E_SYNTAX 14 /* Syntax error */
#define E_NOMEM 15 /* Ran out of memory */ #define E_NOMEM 15 /* Ran out of memory */
#define E_DONE 16 /* Parsing complete */ #define E_DONE 16 /* Parsing complete */
#define E_ERROR 17 /* Execution error */ #define E_ERROR 17 /* Execution error */
#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */ #define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */
#define E_OVERFLOW 19 /* Node had too many children */ #define E_OVERFLOW 19 /* Node had too many children */
#define E_TOODEEP 20 /* Too many indentation levels */ #define E_TOODEEP 20 /* Too many indentation levels */
#define E_DEDENT 21 /* No matching outer block for dedent */ #define E_DEDENT 21 /* No matching outer block for dedent */
#define E_DECODE 22 /* Error in decoding into Unicode */ #define E_DECODE 22 /* Error in decoding into Unicode */
#define E_EOFS 23 /* EOF in triple-quoted string */ #define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */ #define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */
#define E_BADSINGLE 27 /* Ill-formed single statement input */ #define E_BADSINGLE 27 /* Ill-formed single statement input */
#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */ #define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */
#define E_COLUMNOVERFLOW 29 /* Column offset overflow */
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -18,6 +18,12 @@ from test.support.os_helper import TESTFN, unlink
from test.support.warnings_helper import check_warnings from test.support.warnings_helper import check_warnings
from test import support from test import support
try:
from _testcapi import INT_MAX
except ImportError:
INT_MAX = 2**31 - 1
class NaiveException(Exception): class NaiveException(Exception):
def __init__(self, x): def __init__(self, x):
@ -318,11 +324,13 @@ class ExceptionTests(unittest.TestCase):
check('(yield i) = 2', 1, 2) check('(yield i) = 2', 1, 2)
check('def f(*):\n pass', 1, 7) check('def f(*):\n pass', 1, 7)
@unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
@support.requires_resource('cpu') @support.requires_resource('cpu')
@support.bigmemtest(support._2G, memuse=1.5) @support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
def testMemoryErrorBigSource(self, _size): def testMemoryErrorBigSource(self, size):
with self.assertRaises(OverflowError): src = b"if True:\n%*s" % (size, b"pass")
exec(f"if True:\n {' ' * 2**31}print('hello world')") with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
compile(src, '<fragment>', 'exec')
@cpython_only @cpython_only
def testSettingException(self): def testSettingException(self):

View File

@ -59,6 +59,10 @@ tok_nextc(struct tok_state *tok)
int rc; int rc;
for (;;) { for (;;) {
if (tok->cur != tok->inp) { if (tok->cur != tok->inp) {
if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
tok->done = E_COLUMNOVERFLOW;
return EOF;
}
tok->col_offset++; tok->col_offset++;
return Py_CHARMASK(*tok->cur++); /* Fast path */ return Py_CHARMASK(*tok->cur++); /* Fast path */
} }

View File

@ -68,6 +68,7 @@ _Pypegen_tokenizer_error(Parser *p)
const char *msg = NULL; const char *msg = NULL;
PyObject* errtype = PyExc_SyntaxError; PyObject* errtype = PyExc_SyntaxError;
Py_ssize_t col_offset = -1; Py_ssize_t col_offset = -1;
p->error_indicator = 1;
switch (p->tok->done) { switch (p->tok->done) {
case E_TOKEN: case E_TOKEN:
msg = "invalid token"; msg = "invalid token";
@ -103,6 +104,10 @@ _Pypegen_tokenizer_error(Parser *p)
msg = "unexpected character after line continuation character"; msg = "unexpected character after line continuation character";
break; break;
} }
case E_COLUMNOVERFLOW:
PyErr_SetString(PyExc_OverflowError,
"Parser column offset overflow - source line is too big");
return -1;
default: default:
msg = "unknown parsing error"; msg = "unknown parsing error";
} }