From d3afadaa4908df544e0181c11199e59b1bfb5c37 Mon Sep 17 00:00:00 2001
From: Benjamin Peterson
Date: Fri, 9 Oct 2009 21:43:09 +0000
Subject: [PATCH] normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does

---
 Lib/test/test_tokenize.py | 30 +++++++++++++++++++++++++++++-
 Lib/tokenize.py           | 13 ++++++++++++-
 Misc/NEWS                 |  3 +++
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index f395ed43c99..ba705bac7ac 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -719,7 +719,7 @@ class TestDetectEncoding(TestCase):
             b'do_something(else)\n'
         )
         encoding, consumed_lines = detect_encoding(self.get_readline(lines))
-        self.assertEquals(encoding, 'latin-1')
+        self.assertEquals(encoding, 'iso-8859-1')
         self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
 
     def test_matched_bom_and_cookie_first_line(self):
@@ -775,6 +775,34 @@ class TestDetectEncoding(TestCase):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)
 
+    def test_latin1_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
+                     "iso-8859-1-unix", "iso-latin-1-mac")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"print(things)\n",
+                         b"do_something += 4\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "iso-8859-1")
+
+    def test_utf8_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"1 + 3\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "utf-8")
+
     def test_short_files(self):
         readline = self.get_readline((b'print(something)\n',))
         encoding, consumed_lines = detect_encoding(readline)
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index f83bda522a6..fb58c6b77a0 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -279,6 +279,17 @@ def untokenize(iterable):
     return out
 
 
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should
@@ -313,7 +324,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
             codec = lookup(encoding)
         except LookupError:
diff --git a/Misc/NEWS b/Misc/NEWS
index 61f91ed0caf..f542bcb30fd 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -87,6 +87,9 @@ C-API
 
 Library
 -------
 
+- Make tokenize.detect_encoding() normalize utf-8 and iso-8859-1 variants
+  like the builtin tokenizer.
+
 - Issue #7048: Force Decimal.logb to round its result when that result is
   too large to fit in the current precision.
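
As a quick sanity check of the behaviour this patch introduces (not part of the patch itself), here is a minimal sketch that feeds detect_encoding() a buffer whose coding cookie uses a variant spelling; the source bytes and variable names below are made up for illustration and assume a Python 3 interpreter with the patched Lib/tokenize.py:

    # Illustration only: exercises the normalization added by this patch.
    import io
    from tokenize import detect_encoding

    source = (b"#!/usr/bin/python\n"
              b"# coding: ISO_8859_1-unix\n"
              b"print(things)\n")

    encoding, consumed_lines = detect_encoding(io.BytesIO(source).readline)
    print(encoding)        # 'iso-8859-1', the same name the C tokenizer reports
    print(consumed_lines)  # the header lines read while looking for the cookie

Before this change, the variant spelling would have been passed straight to codecs.lookup() and returned unnormalized; with _get_normal_name() in place, pure-Python tokenization reports the same canonical names ('utf-8' and 'iso-8859-1') as the builtin tokenizer.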