normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does
parent ffc08fcad6
commit d3afadaa49
Lib/test/test_tokenize.py

@@ -719,7 +719,7 @@ class TestDetectEncoding(TestCase):
             b'do_something(else)\n'
         )
         encoding, consumed_lines = detect_encoding(self.get_readline(lines))
-        self.assertEquals(encoding, 'latin-1')
+        self.assertEquals(encoding, 'iso-8859-1')
         self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

     def test_matched_bom_and_cookie_first_line(self):
@@ -775,6 +775,34 @@ class TestDetectEncoding(TestCase):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)

+    def test_latin1_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
+                     "iso-8859-1-unix", "iso-latin-1-mac")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"print(things)\n",
+                         b"do_something += 4\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "iso-8859-1")
+
+    def test_utf8_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"1 + 3\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "utf-8")
+
     def test_short_files(self):
         readline = self.get_readline((b'print(something)\n',))
         encoding, consumed_lines = detect_encoding(readline)
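The new tests exercise detect_encoding() through the harness's get_readline() helper; the same behavior can be checked standalone against a byte stream. A minimal sketch, not part of the commit, assuming an interpreter with this patch applied:

    import io
    from tokenize import detect_encoding

    # Every latin-1 alias spelling, whether written with "-" or "_",
    # should come back normalized to "iso-8859-1".
    for spelling in ("latin-1", "LATIN_1", "iso_8859_1", "iso-latin-1-unix"):
        source = b"# coding: " + spelling.encode("ascii") + b"\nprint(1)\n"
        encoding, consumed = detect_encoding(io.BytesIO(source).readline)
        assert encoding == "iso-8859-1", encoding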
Lib/tokenize.py

@@ -279,6 +279,17 @@ def untokenize(iterable):
     return out


+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should
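A few illustrative calls to the helper added above (it is a private name, so this is illustration rather than public API; comments show the expected results):

    from tokenize import _get_normal_name

    _get_normal_name("UTF_8_MAC")    # -> "utf-8"
    _get_normal_name("iso_latin_1")  # -> "iso-8859-1"
    _get_normal_name("euc-jp")       # -> "euc-jp"; unknown names pass through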
@@ -313,7 +324,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
             codec = lookup(encoding)
         except LookupError:
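With the call site switched to _get_normal_name(), the name handed to the codecs lookup(), and ultimately returned to the caller, is the normalized spelling rather than whatever appeared in the coding cookie. An end-to-end sketch, again assuming this patch is applied:

    import io
    from tokenize import detect_encoding

    buf = io.BytesIO(b"#!/usr/bin/python\n# coding: UTF_8_UNIX\npass\n")
    encoding, consumed = detect_encoding(buf.readline)
    print(encoding)  # "utf-8", not "UTF_8_UNIX"
    print(consumed)  # [b'#!/usr/bin/python\n', b'# coding: UTF_8_UNIX\n']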