Issue #5815: Fixed support for locales with modifiers. Fixed support for
locale encodings with hyphens.
2013-12-19 21:21:40 +02:00 · 2013-12-19 21:21:40 +02:00 · 16f02d2b06
parent ed0b87d73c
commit 16f02d2b06
2 changed files with 131 additions and 41 deletions
--- a/Lib/locale.py
+++ b/Lib/locale.py
@@ -336,6 +336,22 @@ def _test():
 # overridden below)
 _setlocale = setlocale
 def _replace_encoding(code, encoding):
    if '.' in code:
        langname = code[:code.index('.')]
    else:
        langname = code
    # Convert the encoding to a C lib compatible encoding string
    norm_encoding = encodings.normalize_encoding(encoding)
    #print('norm encoding: %r' % norm_encoding)
    norm_encoding = encodings.aliases.aliases.get(norm_encoding,
                                                  norm_encoding)
    #print('aliased encoding: %r' % norm_encoding)
    encoding = locale_encoding_alias.get(norm_encoding,
                                         norm_encoding)
    #print('found encoding %r' % encoding)
    return langname + '.' + encoding
 def normalize(localename):
    """ Returns a normalized locale code for the given locale
@@ -352,55 +368,71 @@ def normalize(localename):
        does.
    """
-    # Normalize the locale name and extract the encoding
+    # Normalize the locale name and extract the encoding and modifier
-    fullname = localename.lower()
+    code = localename.lower()
-    if ':' in fullname:
+    if ':' in code:
        # ':' is sometimes used as encoding delimiter.
-        fullname = fullname.replace(':', '.')
+        code = code.replace(':', '.')
-    if '.' in fullname:
+    if '@' in code:
-        langname, encoding = fullname.split('.')[:2]
+        code, modifier = code.split('@', 1)
        fullname = langname + '.' + encoding
    else:
-        langname = fullname
+        modifier = ''
    if '.' in code:
        langname, encoding = code.split('.')[:2]
    else:
        langname = code
        encoding = ''
-    # First lookup: fullname (possibly with encoding)
+    # First lookup: fullname (possibly with encoding and modifier)
-    norm_encoding = encoding.replace('-', '')
+    lang_enc = langname
-    norm_encoding = norm_encoding.replace('_', '')
+    if encoding:
-    lookup_name = langname + '.' + encoding
+        norm_encoding = encoding.replace('-', '')
        norm_encoding = norm_encoding.replace('_', '')
        lang_enc += '.' + norm_encoding
    lookup_name = lang_enc
    if modifier:
        lookup_name += '@' + modifier
    code = locale_alias.get(lookup_name, None)
    if code is not None:
        return code
-    #print 'first lookup failed'
+    #print('first lookup failed')
-    # Second try: langname (without encoding)
+    if modifier:
-    code = locale_alias.get(langname, None)
+        # Second try: fullname without modifier (possibly with encoding)
-    if code is not None:
+        code = locale_alias.get(lang_enc, None)
-        #print 'langname lookup succeeded'
+        if code is not None:
-        if '.' in code:
+            #print('lookup without modifier succeeded')
-            langname, defenc = code.split('.')
+            if '@' not in code:
-        else:
+                return code + '@' + modifier
-            langname = code
+            if code.split('@', 1)[1].lower() == modifier:
-            defenc = ''
+                return code
-        if encoding:
+        #print('second lookup failed')
            # Convert the encoding to a C lib compatible encoding string
            norm_encoding = encodings.normalize_encoding(encoding)
            #print 'norm encoding: %r' % norm_encoding
            norm_encoding = encodings.aliases.aliases.get(norm_encoding,
                                                          norm_encoding)
            #print 'aliased encoding: %r' % norm_encoding
            encoding = locale_encoding_alias.get(norm_encoding,
                                                 norm_encoding)
        else:
            encoding = defenc
        #print 'found encoding %r' % encoding
        if encoding:
            return langname + '.' + encoding
        else:
            return langname
-    else:
+    if encoding:
-        return localename
+        # Third try: langname (without encoding, possibly with modifier)
        lookup_name = langname
        if modifier:
            lookup_name += '@' + modifier
        code = locale_alias.get(lookup_name, None)
        if code is not None:
            #print('lookup without encoding succeeded')
            if '@' not in code:
                return _replace_encoding(code, encoding)
            code, modifier = code.split('@', 1)
            return _replace_encoding(code, encoding) + '@' + modifier
        if modifier:
            # Fourth try: langname (without encoding and modifier)
            code = locale_alias.get(langname, None)
            if code is not None:
                #print('lookup without modifier and encoding succeeded')
                if '@' not in code:
                    return _replace_encoding(code, encoding) + '@' + modifier
                code, defmod = code.split('@', 1)
                if defmod.lower() == modifier:
                    return _replace_encoding(code, encoding) + '@' + defmod
    return localename
 def _parse_localename(localename):
@@ -419,7 +451,7 @@ def _parse_localename(localename):
    code = normalize(localename)
    if '@' in code:
        # Deal with locale modifiers
-        code, modifier = code.split('@')
+        code, modifier = code.split('@', 1)
        if modifier == 'euro' and '.' not in code:
            # Assume Latin-9 for @euro locales. This is bogus,
            # since some systems may use other encodings for these
--- a/Lib/test/test_locale.py
+++ b/Lib/test/test_locale.py
@@ -365,6 +365,64 @@ class TestEnUSCollation(BaseLocalizedTest, TestCollation):
        self.assertLess(locale.strxfrm('à'), locale.strxfrm('b'))
 class NormalizeTest(unittest.TestCase):
    def check(self, localename, expected):
        self.assertEqual(locale.normalize(localename), expected, msg=localename)
    def test_locale_alias(self):
        for localename, alias in locale.locale_alias.items():
            with self.subTest(locale=(localename, alias)):
                self.check(localename, alias)
    def test_empty(self):
        self.check('', '')
    def test_c(self):
        self.check('c', 'C')
        self.check('posix', 'C')
    def test_english(self):
        self.check('en', 'en_US.ISO8859-1')
        self.check('EN', 'en_US.ISO8859-1')
        self.check('en_US', 'en_US.ISO8859-1')
        self.check('en_us', 'en_US.ISO8859-1')
        self.check('en_GB', 'en_GB.ISO8859-1')
        self.check('en_US.UTF-8', 'en_US.UTF-8')
        self.check('en_US.utf8', 'en_US.UTF-8')
        self.check('en_US:UTF-8', 'en_US.UTF-8')
        self.check('en_US.ISO8859-1', 'en_US.ISO8859-1')
        self.check('en_US.US-ASCII', 'en_US.ISO8859-1')
        self.check('english', 'en_EN.ISO8859-1')
    def test_hyphenated_encoding(self):
        self.check('az_AZ.iso88599e', 'az_AZ.ISO8859-9E')
        self.check('az_AZ.ISO8859-9E', 'az_AZ.ISO8859-9E')
        self.check('tt_RU.koi8c', 'tt_RU.KOI8-C')
        self.check('tt_RU.KOI8-C', 'tt_RU.KOI8-C')
        self.check('lo_LA.cp1133', 'lo_LA.IBM-CP1133')
        self.check('lo_LA.ibmcp1133', 'lo_LA.IBM-CP1133')
        self.check('lo_LA.IBM-CP1133', 'lo_LA.IBM-CP1133')
        self.check('uk_ua.microsoftcp1251', 'uk_UA.CP1251')
        self.check('uk_ua.microsoft-cp1251', 'uk_UA.CP1251')
        self.check('ka_ge.georgianacademy', 'ka_GE.GEORGIAN-ACADEMY')
        self.check('ka_GE.GEORGIAN-ACADEMY', 'ka_GE.GEORGIAN-ACADEMY')
        self.check('cs_CZ.iso88592', 'cs_CZ.ISO8859-2')
        self.check('cs_CZ.ISO8859-2', 'cs_CZ.ISO8859-2')
    def test_euro_modifier(self):
        self.check('de_DE@euro', 'de_DE.ISO8859-15')
        self.check('en_US.ISO8859-15@euro', 'en_US.ISO8859-15')
    def test_latin_modifier(self):
        self.check('be_BY.UTF-8@latin', 'be_BY.UTF-8@latin')
        self.check('sr_RS.UTF-8@latin', 'sr_RS.UTF-8@latin')
    def test_valencia_modifier(self):
        self.check('ca_ES.UTF-8@valencia', 'ca_ES.UTF-8@valencia')
        self.check('ca_ES@valencia', 'ca_ES.ISO8859-1@valencia')
        self.check('ca@valencia', 'ca_ES.ISO8859-1@valencia')
 class TestMiscellaneous(unittest.TestCase):
    def test_getpreferredencoding(self):
        # Invoke getpreferredencoding to make sure it does not cause exceptions.