raise a SyntaxError in detect_encoding() when a codec lookup fails like the builtin parser #4021
parent e675f08e03
commit 433f32c3be
Lib/test/test_tokenize.py
@@ -795,6 +795,8 @@ class TestDetectEncoding(TestCase):
         self.assertEquals(encoding, 'utf-8')
         self.assertEquals(consumed_lines, [])
 
+        readline = self.get_readline((b'# coding: bad\n',))
+        self.assertRaises(SyntaxError, detect_encoding, readline)
 
 class TestTokenize(TestCase):
 
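For orientation, the snippet below pulls the new assertion out of the test class and runs it directly. It assumes only that TestDetectEncoding.get_readline() hands back a readline-style callable over the given byte lines; iter(lines).__next__ plays that role here.

# Standalone sketch of what the added test exercises (assumes the patched
# tokenize.detect_encoding() from this commit).
from tokenize import detect_encoding

lines = (b'# coding: bad\n',)
readline = iter(lines).__next__   # stand-in for self.get_readline(lines)

try:
    detect_encoding(readline)
except SyntaxError as exc:
    print(exc)                    # unknown encoding: bad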
Lib/tokenize.py
@@ -26,7 +26,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
 
 import re, string, sys
 from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
 from itertools import chain, repeat
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
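The import change leans on two things the codecs module already provides; the snippet below is only an illustration of that behaviour, not part of the patch: lookup() normalizes codec names and raises LookupError for unknown ones, and BOM_UTF8 is the same three bytes the removed utf8_bom constant spelled out by hand.

from codecs import lookup, BOM_UTF8

print(BOM_UTF8)               # b'\xef\xbb\xbf', what utf8_bom hard-coded
print(lookup('UTF8').name)    # 'utf-8', lookup() normalizes codec names
try:
    lookup('bad')             # unknown names raise LookupError, which
except LookupError:           # find_cookie() now converts to SyntaxError
    pass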
@@ -251,11 +251,11 @@ def detect_encoding(readline):
 
     It detects the encoding from the presence of a utf-8 bom or an encoding
     cookie as specified in pep-0263. If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised.
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
 
     If no encoding is specified, then the default of 'utf-8' will be returned.
     """
-    utf8_bom = b'\xef\xbb\xbf'
     bom_found = False
     encoding = None
     def read_or_stop():
@@ -268,18 +268,25 @@ def detect_encoding(readline):
         try:
             line_string = line.decode('ascii')
         except UnicodeDecodeError:
-            pass
-        else:
-            matches = cookie_re.findall(line_string)
-            if matches:
-                encoding = matches[0]
-                if bom_found and lookup(encoding).name != 'utf-8':
-                    # This behaviour mimics the Python interpreter
-                    raise SyntaxError('encoding problem: utf-8')
-                return encoding
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
 
     first = read_or_stop()
-    if first.startswith(utf8_bom):
+    if first.startswith(BOM_UTF8):
         bom_found = True
         first = first[3:]
         if not first:
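Taken together, the rewritten find_cookie() gives detect_encoding() the behaviour sketched below. make_readline() is only an illustrative helper for this sketch, not part of the module.

from codecs import BOM_UTF8
from tokenize import detect_encoding

def make_readline(lines):
    # Illustrative helper: a readline-style callable over fixed byte lines.
    return iter(lines).__next__

# A recognizable cookie is returned along with the lines consumed to find it.
encoding, consumed = detect_encoding(make_readline((b'# coding: utf-8\n',)))
print(encoding, consumed)     # utf-8 [b'# coding: utf-8\n']

# A cookie naming an unknown codec now raises SyntaxError (previously the
# LookupError from codecs.lookup() escaped to the caller).
try:
    detect_encoding(make_readline((b'# coding: bad\n',)))
except SyntaxError as exc:
    print(exc)                # unknown encoding: bad

# A UTF-8 BOM combined with a cookie naming a different codec still raises
# SyntaxError, as it did before this change.
try:
    detect_encoding(make_readline((BOM_UTF8 + b'# coding: latin-1\n',)))
except SyntaxError as exc:
    print(exc)                # encoding problem: utf-8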
Misc/NEWS
@@ -45,6 +45,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #4021: tokenize.detect_encoding() now raises a SyntaxError when the
+  codec cannot be found. This is for compatibility with the builtin behavior.
+
 - Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to
   give correct results in the case where one argument is a quiet NaN
   and the other is a finite number that requires rounding.