normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does

Benjamin Peterson 2009-10-09 21:43:09 +00:00
parent ffc08fcad6
commit d3afadaa49
3 changed files with 44 additions and 2 deletions
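For orientation (not part of the commit itself), a minimal sketch of the user-visible effect, assuming Python 3 with this patch applied: a variant spelling of the coding cookie now comes back normalized from detect_encoding(), matching what the C tokenizer reports.

    import io
    from tokenize import detect_encoding

    source = b"# -*- coding: latin_1 -*-\nprint('hi')\n"
    encoding, consumed = detect_encoding(io.BytesIO(source).readline)
    print(encoding)  # 'iso-8859-1' with this patch; the raw 'latin_1' before it
    print(consumed)  # [b"# -*- coding: latin_1 -*-\n"]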

Lib/test/test_tokenize.py

@@ -719,7 +719,7 @@ class TestDetectEncoding(TestCase):
             b'do_something(else)\n'
         )
         encoding, consumed_lines = detect_encoding(self.get_readline(lines))
-        self.assertEquals(encoding, 'latin-1')
+        self.assertEquals(encoding, 'iso-8859-1')
         self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
 
     def test_matched_bom_and_cookie_first_line(self):
@@ -775,6 +775,34 @@ class TestDetectEncoding(TestCase):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)
 
+    def test_latin1_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
+                     "iso-8859-1-unix", "iso-latin-1-mac")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"print(things)\n",
+                         b"do_something += 4\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "iso-8859-1")
+
+    def test_utf8_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"1 + 3\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "utf-8")
+
     def test_short_files(self):
         readline = self.get_readline((b'print(something)\n',))
         encoding, consumed_lines = detect_encoding(readline)

Lib/tokenize.py

@@ -279,6 +279,17 @@ def untokenize(iterable):
     return out
 
+
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should
@@ -313,7 +324,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
             codec = lookup(encoding)
         except LookupError:
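As a standalone sketch of the normalization rule added above: the helper body is copied from the hunk, while the example inputs below are illustrative, chosen to cover the case, separator, and suffix variants the helper accepts.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if enc == "utf-8" or enc.startswith("utf-8-"):
            return "utf-8"
        if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
           enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
            return "iso-8859-1"
        return orig_enc

    assert _get_normal_name("UTF-8-MAC") == "utf-8"
    assert _get_normal_name("Latin_1_unix") == "iso-8859-1"
    assert _get_normal_name("ISO-LATIN-1") == "iso-8859-1"
    assert _get_normal_name("euc-jp") == "euc-jp"  # unknown names pass through unchanged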

Misc/NEWS

@@ -87,6 +87,9 @@ C-API
 Library
 -------
 
+- Make tokenize.detect_encoding() normalize utf-8 and iso-8859-1 variants like
+  the builtin tokenizer.
+
 - Issue #7048: Force Decimal.logb to round its result when that result
   is too large to fit in the current precision.