From c5b049b91ca50c615f9a5425055c2b79a82ac547 Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Wed, 14 Oct 2020 23:43:31 +0800 Subject: [PATCH] bpo-39337: encodings.normalize_encoding() now ignores non-ASCII characters (GH-22219) --- Doc/whatsnew/3.10.rst | 5 +++++ Lib/encodings/__init__.py | 3 ++- Lib/test/test_codecs.py | 14 +++++++++++++- .../2020-09-13-02-02-18.bpo-39337.L3NXTt.rst | 1 + 4 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index c8ddcd2d242..738ef974e78 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -186,6 +186,11 @@ by :func:`curses.color_content`, :func:`curses.init_color`, support is provided by the underlying ncurses library. (Contributed by Jeffrey Kintscher and Hans Petter Jansson in :issue:`36982`.) +encodings +--------- +:func:`encodings.normalize_encoding` now ignores non-ASCII characters. +(Contributed by Hai Shi in :issue:`39337`.) + glob ---- diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index ddd5afdcf2d..4b37d3321c9 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -61,7 +61,8 @@ def normalize_encoding(encoding): if c.isalnum() or c == '.': if punct and chars: chars.append('_') - chars.append(c) + if c.isascii(): + chars.append(c) punct = False else: punct = True diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index ddf4e08af62..09ceef76eb0 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3417,7 +3417,7 @@ class Rot13UtilTest(unittest.TestCase): class CodecNameNormalizationTest(unittest.TestCase): """Test codec name normalization""" - def test_normalized_encoding(self): + def test_codecs_lookup(self): FOUND = (1, 2, 3, 4) NOT_FOUND = (None, None, None, None) def search_function(encoding): @@ -3439,6 +3439,18 @@ class CodecNameNormalizationTest(unittest.TestCase): self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8')) self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) + def test_encodings_normalize_encoding(self): + # encodings.normalize_encoding() ignores non-ASCII characters. + normalize = encodings.normalize_encoding + self.assertEqual(normalize('utf_8'), 'utf_8') + self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') + self.assertEqual(normalize('utf 8'), 'utf_8') + # encodings.normalize_encoding() doesn't convert + # characters to lower case. + self.assertEqual(normalize('UTF 8'), 'UTF_8') + self.assertEqual(normalize('utf.8'), 'utf.8') + self.assertEqual(normalize('utf...8'), 'utf...8') + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst new file mode 100644 index 00000000000..c2b4dbe4d12 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst @@ -0,0 +1 @@ +:func:`encodings.normalize_encoding` now ignores non-ASCII characters.