diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 9e9656ced60..63d084df731 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -825,6 +825,16 @@ class TestDetectEncoding(TestCase):
         found, consumed_lines = detect_encoding(rl)
         self.assertEqual(found, "iso-8859-1")
 
+    def test_syntaxerror_latin1(self):
+        # Issue 14629: need to raise SyntaxError if the first
+        # line(s) have non-UTF-8 characters
+        lines = (
+            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
+        )
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline)
+
+
     def test_utf8_normalization(self):
         # See get_normal_name() in tokenizer.c.
         encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index f575e9bc237..f283c6dd7f9 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -292,9 +292,12 @@ def detect_encoding(readline):
 
     def find_cookie(line):
         try:
-            line_string = line.decode('ascii')
+            # Decode as UTF-8. Either the line is an encoding declaration,
+            # in which case it should be pure ASCII, or it must be UTF-8
+            # per the default encoding.
+            line_string = line.decode('utf-8')
         except UnicodeDecodeError:
-            return None
+            raise SyntaxError("invalid or missing encoding declaration")
 
         matches = cookie_re.findall(line_string)
         if not matches:
diff --git a/Misc/NEWS b/Misc/NEWS
index 47d7b1097e0..addae1b54c6 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@ Core and Builtins
 
 Library
 -------
 
+- Issue #14629: Raise SyntaxError in tokenize.detect_encoding if the
+  first two lines have non-UTF-8 characters without an encoding declaration.
+
 - Issue #14308: Fix an exception when a "dummy" thread is in the threading
   module's active list after a fork().
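
For reviewers, a quick sketch of the resulting behavior from the caller's
side. This usage example is not part of the patch; the source bytes are
invented for illustration, while tokenize.detect_encoding() and io.BytesIO
are the standard library APIs the patch touches:

    import io
    from tokenize import detect_encoding

    # Latin-1 source with no coding cookie: b'\xdf' is not valid UTF-8,
    # so the patched detect_encoding() raises SyntaxError up front
    # instead of deferring the failure to a later decode.
    bad = io.BytesIO(b'print("\xdf")\n')
    try:
        detect_encoding(bad.readline)
    except SyntaxError as exc:
        print(exc)  # invalid or missing encoding declaration

    # The same byte is fine once an encoding declaration is present: the
    # cookie line itself is pure ASCII, so it still decodes as UTF-8 and
    # detect_encoding() never needs to decode the second line.
    good = io.BytesIO(b'# -*- coding: latin-1 -*-\nprint("\xdf")\n')
    encoding, consumed_lines = detect_encoding(good.readline)
    print(encoding)  # iso-8859-1 (get_normal_name() normalizes "latin-1")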