From c33f3f2339fd3217a0c6fe3df916616abab2fab4 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Fri, 20 Apr 2012 13:23:54 -0400 Subject: [PATCH] Issue #14629: Mention the filename in SyntaxError exceptions from tokenizer.detect_encoding() (when available). --- Lib/test/test_tokenize.py | 29 +++++++++++++++++++++++++++++ Lib/tokenize.py | 22 +++++++++++++++++++--- Misc/NEWS | 3 +++ 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 11590ead86d..915eda9c9ae 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -904,6 +904,35 @@ class TestDetectEncoding(TestCase): self.assertEqual(fp.encoding, 'utf-8-sig') self.assertEqual(fp.mode, 'r') + def test_filename_in_exception(self): + # When possible, include the file name in the exception. + path = 'some_file_path' + lines = ( + b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S + ) + class Bunk: + def __init__(self, lines, path): + self.name = path + self._lines = lines + self._index = 0 + + def readline(self): + if self._index == len(lines): + raise StopIteration + line = lines[self._index] + self._index += 1 + return line + + with self.assertRaises(SyntaxError): + ins = Bunk(lines, path) + # Make sure lacking a name isn't an issue. + del ins.name + detect_encoding(ins.readline) + with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)): + ins = Bunk(lines, path) + detect_encoding(ins.readline) + + class TestTokenize(TestCase): def test_tokenize(self): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index c05f764dd86..e4c9d3c0845 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -353,6 +353,10 @@ def detect_encoding(readline): If no encoding is specified, then the default of 'utf-8' will be returned. """ + try: + filename = readline.__self__.name + except AttributeError: + filename = None bom_found = False encoding = None default = 'utf-8' @@ -369,7 +373,10 @@ def detect_encoding(readline): # per default encoding. 
line_string = line.decode('utf-8') except UnicodeDecodeError: - raise SyntaxError("invalid or missing encoding declaration") + msg = "invalid or missing encoding declaration" + if filename is not None: + msg = '{} for {!r}'.format(msg, filename) + raise SyntaxError(msg) matches = cookie_re.findall(line_string) if not matches: @@ -379,12 +386,21 @@ def detect_encoding(readline): codec = lookup(encoding) except LookupError: # This behaviour mimics the Python interpreter - raise SyntaxError("unknown encoding: " + encoding) + if filename is None: + msg = "unknown encoding: " + encoding + else: + msg = "unknown encoding for {!r}: {}".format(filename, + encoding) + raise SyntaxError(msg) if bom_found: if codec.name != 'utf-8': # This behaviour mimics the Python interpreter - raise SyntaxError('encoding problem: utf-8') + if filename is None: + msg = 'encoding problem: utf-8' + else: + msg = 'encoding problem for {!r}: utf-8'.format(filename) + raise SyntaxError(msg) encoding += '-sig' return encoding diff --git a/Misc/NEWS b/Misc/NEWS index b98cdca6002..6d3410f177a 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -55,6 +55,9 @@ Core and Builtins Library ------- +- Issue #14629: tokenize.detect_encoding will specify the filename in the + SyntaxError exception if found at readline.__self__.name. + - Issue #14629: Raise SyntaxError in tokenizer.detect_encoding if the first two lines have non-UTF-8 characters without an encoding declaration.