Issue #5815: Fixed support for locales with modifiers. Fixed support for
locale encodings with hyphens.
This commit is contained in:
Serhiy Storchaka 2013-12-19 21:21:40 +02:00
parent ed0b87d73c
commit 16f02d2b06
2 changed files with 131 additions and 41 deletions

View File

@ -336,6 +336,22 @@ def _test():
# overridden below) # overridden below)
_setlocale = setlocale _setlocale = setlocale
def _replace_encoding(code, encoding):
    """Return locale *code* with its encoding part replaced by *encoding*.

    Everything in *code* from the first '.' onwards is discarded; when
    there is no '.', the whole of *code* is kept as the language name.

    *encoding* is first passed through encodings.normalize_encoding(),
    then looked up in encodings.aliases.aliases and finally in
    locale_encoding_alias.  At both lookup steps a name that is not
    found is kept unchanged.

    The result is always of the form '<langname>.<encoding>'.
    """
    # Language part only: the text in front of the first '.', if any.
    langname = code.partition('.')[0]

    # Convert the encoding to a C lib compatible encoding string
    normalized = encodings.normalize_encoding(encoding)
    aliased = encodings.aliases.aliases.get(normalized, normalized)
    c_encoding = locale_encoding_alias.get(aliased, aliased)

    return langname + '.' + c_encoding
def normalize(localename): def normalize(localename):
""" Returns a normalized locale code for the given locale """ Returns a normalized locale code for the given locale
@ -352,55 +368,71 @@ def normalize(localename):
does. does.
""" """
# Normalize the locale name and extract the encoding # Normalize the locale name and extract the encoding and modifier
fullname = localename.lower() code = localename.lower()
if ':' in fullname: if ':' in code:
# ':' is sometimes used as encoding delimiter. # ':' is sometimes used as encoding delimiter.
fullname = fullname.replace(':', '.') code = code.replace(':', '.')
if '.' in fullname: if '@' in code:
langname, encoding = fullname.split('.')[:2] code, modifier = code.split('@', 1)
fullname = langname + '.' + encoding
else: else:
langname = fullname modifier = ''
if '.' in code:
langname, encoding = code.split('.')[:2]
else:
langname = code
encoding = '' encoding = ''
# First lookup: fullname (possibly with encoding) # First lookup: fullname (possibly with encoding and modifier)
norm_encoding = encoding.replace('-', '') lang_enc = langname
norm_encoding = norm_encoding.replace('_', '') if encoding:
lookup_name = langname + '.' + encoding norm_encoding = encoding.replace('-', '')
norm_encoding = norm_encoding.replace('_', '')
lang_enc += '.' + norm_encoding
lookup_name = lang_enc
if modifier:
lookup_name += '@' + modifier
code = locale_alias.get(lookup_name, None) code = locale_alias.get(lookup_name, None)
if code is not None: if code is not None:
return code return code
#print 'first lookup failed' #print('first lookup failed')
# Second try: langname (without encoding) if modifier:
code = locale_alias.get(langname, None) # Second try: fullname without modifier (possibly with encoding)
if code is not None: code = locale_alias.get(lang_enc, None)
#print 'langname lookup succeeded' if code is not None:
if '.' in code: #print('lookup without modifier succeeded')
langname, defenc = code.split('.') if '@' not in code:
else: return code + '@' + modifier
langname = code if code.split('@', 1)[1].lower() == modifier:
defenc = '' return code
if encoding: #print('second lookup failed')
# Convert the encoding to a C lib compatible encoding string
norm_encoding = encodings.normalize_encoding(encoding)
#print 'norm encoding: %r' % norm_encoding
norm_encoding = encodings.aliases.aliases.get(norm_encoding,
norm_encoding)
#print 'aliased encoding: %r' % norm_encoding
encoding = locale_encoding_alias.get(norm_encoding,
norm_encoding)
else:
encoding = defenc
#print 'found encoding %r' % encoding
if encoding:
return langname + '.' + encoding
else:
return langname
else: if encoding:
return localename # Third try: langname (without encoding, possibly with modifier)
lookup_name = langname
if modifier:
lookup_name += '@' + modifier
code = locale_alias.get(lookup_name, None)
if code is not None:
#print('lookup without encoding succeeded')
if '@' not in code:
return _replace_encoding(code, encoding)
code, modifier = code.split('@', 1)
return _replace_encoding(code, encoding) + '@' + modifier
if modifier:
# Fourth try: langname (without encoding and modifier)
code = locale_alias.get(langname, None)
if code is not None:
#print('lookup without modifier and encoding succeeded')
if '@' not in code:
return _replace_encoding(code, encoding) + '@' + modifier
code, defmod = code.split('@', 1)
if defmod.lower() == modifier:
return _replace_encoding(code, encoding) + '@' + defmod
return localename
def _parse_localename(localename): def _parse_localename(localename):
@ -419,7 +451,7 @@ def _parse_localename(localename):
code = normalize(localename) code = normalize(localename)
if '@' in code: if '@' in code:
# Deal with locale modifiers # Deal with locale modifiers
code, modifier = code.split('@') code, modifier = code.split('@', 1)
if modifier == 'euro' and '.' not in code: if modifier == 'euro' and '.' not in code:
# Assume Latin-9 for @euro locales. This is bogus, # Assume Latin-9 for @euro locales. This is bogus,
# since some systems may use other encodings for these # since some systems may use other encodings for these

View File

@ -365,6 +365,64 @@ class TestEnUSCollation(BaseLocalizedTest, TestCollation):
self.assertLess(locale.strxfrm('à'), locale.strxfrm('b')) self.assertLess(locale.strxfrm('à'), locale.strxfrm('b'))
class NormalizeTest(unittest.TestCase):
def check(self, localename, expected):
self.assertEqual(locale.normalize(localename), expected, msg=localename)
def test_locale_alias(self):
for localename, alias in locale.locale_alias.items():
with self.subTest(locale=(localename, alias)):
self.check(localename, alias)
def test_empty(self):
self.check('', '')
def test_c(self):
self.check('c', 'C')
self.check('posix', 'C')
def test_english(self):
self.check('en', 'en_US.ISO8859-1')
self.check('EN', 'en_US.ISO8859-1')
self.check('en_US', 'en_US.ISO8859-1')
self.check('en_us', 'en_US.ISO8859-1')
self.check('en_GB', 'en_GB.ISO8859-1')
self.check('en_US.UTF-8', 'en_US.UTF-8')
self.check('en_US.utf8', 'en_US.UTF-8')
self.check('en_US:UTF-8', 'en_US.UTF-8')
self.check('en_US.ISO8859-1', 'en_US.ISO8859-1')
self.check('en_US.US-ASCII', 'en_US.ISO8859-1')
self.check('english', 'en_EN.ISO8859-1')
def test_hyphenated_encoding(self):
self.check('az_AZ.iso88599e', 'az_AZ.ISO8859-9E')
self.check('az_AZ.ISO8859-9E', 'az_AZ.ISO8859-9E')
self.check('tt_RU.koi8c', 'tt_RU.KOI8-C')
self.check('tt_RU.KOI8-C', 'tt_RU.KOI8-C')
self.check('lo_LA.cp1133', 'lo_LA.IBM-CP1133')
self.check('lo_LA.ibmcp1133', 'lo_LA.IBM-CP1133')
self.check('lo_LA.IBM-CP1133', 'lo_LA.IBM-CP1133')
self.check('uk_ua.microsoftcp1251', 'uk_UA.CP1251')
self.check('uk_ua.microsoft-cp1251', 'uk_UA.CP1251')
self.check('ka_ge.georgianacademy', 'ka_GE.GEORGIAN-ACADEMY')
self.check('ka_GE.GEORGIAN-ACADEMY', 'ka_GE.GEORGIAN-ACADEMY')
self.check('cs_CZ.iso88592', 'cs_CZ.ISO8859-2')
self.check('cs_CZ.ISO8859-2', 'cs_CZ.ISO8859-2')
def test_euro_modifier(self):
self.check('de_DE@euro', 'de_DE.ISO8859-15')
self.check('en_US.ISO8859-15@euro', 'en_US.ISO8859-15')
def test_latin_modifier(self):
self.check('be_BY.UTF-8@latin', 'be_BY.UTF-8@latin')
self.check('sr_RS.UTF-8@latin', 'sr_RS.UTF-8@latin')
def test_valencia_modifier(self):
self.check('ca_ES.UTF-8@valencia', 'ca_ES.UTF-8@valencia')
self.check('ca_ES@valencia', 'ca_ES.ISO8859-1@valencia')
self.check('ca@valencia', 'ca_ES.ISO8859-1@valencia')
class TestMiscellaneous(unittest.TestCase): class TestMiscellaneous(unittest.TestCase):
def test_getpreferredencoding(self): def test_getpreferredencoding(self):
# Invoke getpreferredencoding to make sure it does not cause exceptions. # Invoke getpreferredencoding to make sure it does not cause exceptions.