From d73aca769f1f6eebb46faa9161cbebe806db3659 Mon Sep 17 00:00:00 2001
From: Benjamin Peterson
Date: Tue, 21 Apr 2015 12:05:19 -0400
Subject: [PATCH] do not call into python api if an exception is set (#24022)

---
 Lib/test/test_compile.py | 14 +++++++++++++-
 Misc/NEWS                |  2 ++
 Parser/tokenizer.c       |  7 +++----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py
index 611667690fe..cff3c9ea0b5 100644
--- a/Lib/test/test_compile.py
+++ b/Lib/test/test_compile.py
@@ -1,9 +1,11 @@
 import math
+import os
 import unittest
 import sys
 import _ast
+import tempfile
 import types
-from test import support
+from test import support, script_helper
 
 class TestSpecifics(unittest.TestCase):
 
@@ -492,6 +494,16 @@ if 1:
         self.assertInvalidSingle('f()\nxy # blah\nblah()')
         self.assertInvalidSingle('x = 5 # comment\nx = 6\n')
 
+    def test_particularly_evil_undecodable(self):
+        # Issue 24022
+        src = b'0000\x00\n00000000000\n\x00\n\x9e\n'
+        with tempfile.TemporaryDirectory() as tmpd:
+            fn = os.path.join(tmpd, "bad.py")
+            with open(fn, "wb") as fp:
+                fp.write(src)
+            res = script_helper.run_python_until_end(fn)[0]
+        self.assertIn(b"Non-UTF-8", res.err)
+
     @support.cpython_only
     def test_compiler_recursion_limit(self):
         # Expected limit is sys.getrecursionlimit() * the scaling factor
diff --git a/Misc/NEWS b/Misc/NEWS
index a6a3d822dbd..183f7d19ca6 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ Release date: tba
 Core and Builtins
 -----------------
 
+- Issue #24022: Fix tokenizer crash when processing undecodable source code.
+
 - Issue #23309: Avoid a deadlock at shutdown if a daemon thread is aborted
   while it is holding a lock to a buffered I/O object, and the main thread
   tries to use the same I/O object (typically stdout or stderr).  A fatal
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 22accd1061a..5e041ea5b30 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1301,6 +1301,8 @@ verify_identifier(struct tok_state *tok)
 {
     PyObject *s;
     int result;
+    if (tok->decoding_erred)
+        return 0;
     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
     if (s == NULL || PyUnicode_READY(s) == -1) {
         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
@@ -1469,11 +1471,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
             c = tok_nextc(tok);
         }
         tok_backup(tok, c);
-        if (nonascii &&
-            !verify_identifier(tok)) {
-            tok->done = E_IDENTIFIER;
+        if (nonascii && !verify_identifier(tok))
             return ERRORTOKEN;
-        }
         *p_start = tok->start;
         *p_end = tok->cur;
         return NAME;