Issue #14629: Mention the filename in SyntaxError exceptions from
tokenize.detect_encoding() (when available).
This commit is contained in:
parent
dd9a56953e
commit
c33f3f2339
|
@ -904,6 +904,35 @@ class TestDetectEncoding(TestCase):
|
||||||
self.assertEqual(fp.encoding, 'utf-8-sig')
|
self.assertEqual(fp.encoding, 'utf-8-sig')
|
||||||
self.assertEqual(fp.mode, 'r')
|
self.assertEqual(fp.mode, 'r')
|
||||||
|
|
||||||
|
def test_filename_in_exception(self):
|
||||||
|
# When possible, include the file name in the exception.
|
||||||
|
path = 'some_file_path'
|
||||||
|
lines = (
|
||||||
|
b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
|
||||||
|
)
|
||||||
|
class Bunk:
    """Minimal file-like stub exposing ``name`` and ``readline``.

    ``readline`` hands back the supplied byte lines one at a time, and
    ``name`` carries the path that ``detect_encoding`` looks up through
    ``readline.__self__.name``.
    """

    def __init__(self, lines, path):
        """Store the sequence of byte *lines* to serve and the fake *path*."""
        self.name = path
        self._lines = lines
        self._index = 0

    def readline(self):
        """Return the next stored line.

        Raises StopIteration once every line has been returned.
        """
        # Read from the instance's own copy so the stub does not depend on
        # a variable named `lines` existing in the surrounding scope.
        if self._index == len(self._lines):
            raise StopIteration
        line = self._lines[self._index]
        self._index += 1
        return line
|
||||||
|
|
||||||
|
with self.assertRaises(SyntaxError):
|
||||||
|
ins = Bunk(lines, path)
|
||||||
|
# Make sure lacking a name isn't an issue.
|
||||||
|
del ins.name
|
||||||
|
detect_encoding(ins.readline)
|
||||||
|
with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
|
||||||
|
ins = Bunk(lines, path)
|
||||||
|
detect_encoding(ins.readline)
|
||||||
|
|
||||||
|
|
||||||
class TestTokenize(TestCase):
|
class TestTokenize(TestCase):
|
||||||
|
|
||||||
def test_tokenize(self):
|
def test_tokenize(self):
|
||||||
|
|
|
@ -353,6 +353,10 @@ def detect_encoding(readline):
|
||||||
|
|
||||||
If no encoding is specified, then the default of 'utf-8' will be returned.
|
If no encoding is specified, then the default of 'utf-8' will be returned.
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
|
filename = readline.__self__.name
|
||||||
|
except AttributeError:
|
||||||
|
filename = None
|
||||||
bom_found = False
|
bom_found = False
|
||||||
encoding = None
|
encoding = None
|
||||||
default = 'utf-8'
|
default = 'utf-8'
|
||||||
|
@ -369,7 +373,10 @@ def detect_encoding(readline):
|
||||||
# per default encoding.
|
# per default encoding.
|
||||||
line_string = line.decode('utf-8')
|
line_string = line.decode('utf-8')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
raise SyntaxError("invalid or missing encoding declaration")
|
msg = "invalid or missing encoding declaration"
|
||||||
|
if filename is not None:
|
||||||
|
msg = '{} for {!r}'.format(msg, filename)
|
||||||
|
raise SyntaxError(msg)
|
||||||
|
|
||||||
matches = cookie_re.findall(line_string)
|
matches = cookie_re.findall(line_string)
|
||||||
if not matches:
|
if not matches:
|
||||||
|
@ -379,12 +386,21 @@ def detect_encoding(readline):
|
||||||
codec = lookup(encoding)
|
codec = lookup(encoding)
|
||||||
except LookupError:
|
except LookupError:
|
||||||
# This behaviour mimics the Python interpreter
|
# This behaviour mimics the Python interpreter
|
||||||
raise SyntaxError("unknown encoding: " + encoding)
|
if filename is None:
|
||||||
|
msg = "unknown encoding: " + encoding
|
||||||
|
else:
|
||||||
|
msg = "unknown encoding for {!r}: {}".format(filename,
|
||||||
|
encoding)
|
||||||
|
raise SyntaxError(msg)
|
||||||
|
|
||||||
if bom_found:
|
if bom_found:
|
||||||
if codec.name != 'utf-8':
|
if codec.name != 'utf-8':
|
||||||
# This behaviour mimics the Python interpreter
|
# This behaviour mimics the Python interpreter
|
||||||
raise SyntaxError('encoding problem: utf-8')
|
if filename is None:
|
||||||
|
msg = 'encoding problem: utf-8'
|
||||||
|
else:
|
||||||
|
msg = 'encoding problem for {!r}: utf-8'.format(filename)
|
||||||
|
raise SyntaxError(msg)
|
||||||
encoding += '-sig'
|
encoding += '-sig'
|
||||||
return encoding
|
return encoding
|
||||||
|
|
||||||
|
|
|
@ -55,6 +55,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #14629: tokenize.detect_encoding will specify the filename in the
|
||||||
|
SyntaxError exception if found at readline.__self__.name.
|
||||||
|
|
||||||
- Issue #14629: Raise SyntaxError in tokenize.detect_encoding if the
|
- Issue #14629: Raise SyntaxError in tokenize.detect_encoding if the
|
||||||
first two lines have non-UTF-8 characters without an encoding declaration.
|
first two lines have non-UTF-8 characters without an encoding declaration.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue