2000-03-10 19:17:24 -04:00
|
|
|
""" Standard "encodings" Package
|
|
|
|
|
|
|
|
Standard Python encoding modules are stored in this package
|
|
|
|
directory.
|
|
|
|
|
2002-10-04 08:45:38 -03:00
|
|
|
Codec modules must have names corresponding to normalized encoding
|
|
|
|
names as defined in the normalize_encoding() function below, e.g.
|
|
|
|
'utf-8' must be implemented by the module 'utf_8.py'.
|
2000-03-10 19:17:24 -04:00
|
|
|
|
|
|
|
Each codec module must export the following interface:
|
|
|
|
|
2006-03-15 07:35:15 -04:00
|
|
|
* getregentry() -> codecs.CodecInfo object
|
|
|
|
The getregentry() API must return a CodecInfo object with encoder, decoder,
|
|
|
|
incrementalencoder, incrementaldecoder, streamwriter and streamreader
|
|
|
|
attributes which adhere to the Python Codec Interface Standard.
|
2000-03-10 19:17:24 -04:00
|
|
|
|
|
|
|
In addition, a module may optionally also define the following
|
|
|
|
APIs which are then used by the package's codec search function:
|
|
|
|
|
|
|
|
* getaliases() -> sequence of encoding name strings to use as aliases
|
|
|
|
|
2002-10-04 08:45:38 -03:00
|
|
|
Alias names returned by getaliases() must be normalized encoding
|
|
|
|
names as defined by normalize_encoding().
|
2000-03-10 19:17:24 -04:00
|
|
|
|
|
|
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
|
|
|
|
|
|
|
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
|
|
|
|
|
|
|
"""#"
|
|
|
|
|
2006-08-24 22:52:49 -03:00
|
|
|
import codecs
|
2006-03-15 19:08:13 -04:00
|
|
|
from encodings import aliases
|
2008-01-23 10:20:50 -04:00
|
|
|
import __builtin__
|
2000-03-10 19:17:24 -04:00
|
|
|
|
|
|
|
# Lookup cache used by search_function(): maps the encoding name exactly
# as it was requested to the resulting registry entry, or to None when the
# name is known not to resolve to a codec module.
_cache = {}

# Sentinel default for _cache.get(); lets a cached None (a remembered miss)
# be told apart from a name that has never been looked up.
_unknown = '--unknown--'

# Non-empty fromlist handed to __import__() so that the call returns the
# 'encodings.<name>' submodule itself instead of the top-level package.
_import_tail = ['*']

# 256-entry translation table for 8-bit str.translate(): ASCII letters,
# digits and '.' map to themselves, every other byte maps to a space.
# normalize_encoding() then splits on those spaces and re-joins the pieces
# with '_'.  The table is spelled out with run lengths so that its size
# cannot silently change when whitespace in this file is reformatted.
_norm_encoding_map = (
    ' ' * 46 + '.' + ' '                          # 0x00-0x2F
    + '0123456789' + ' ' * 7                      # 0x30-0x40
    + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + ' ' * 6      # 0x41-0x60
    + 'abcdefghijklmnopqrstuvwxyz' + ' ' * 133    # 0x61-0xFF
)

# Shared alias table (normalized alias -> codec module name).  This is the
# very dict object owned by encodings.aliases, so aliases registered by
# search_function() become visible there as well.
_aliases = aliases.aliases
|
2000-03-10 19:17:24 -04:00
|
|
|
|
2005-08-31 21:45:28 -03:00
|
|
|
class CodecRegistryError(LookupError, SystemError):
    """Raised when a codec module returns an unusable registry entry.

    Derives from both LookupError and SystemError, so handlers written
    for either base class will catch it.
    """
|
|
|
|
|
2002-10-04 08:45:38 -03:00
|
|
|
def normalize_encoding(encoding):

    """ Return the normalized form of an encoding name.

        Every run of characters other than ASCII letters, digits and
        the dot (kept because it is used in Python package names) is
        turned into a single underscore, e.g. ' -;#' becomes '_'.
        Underscores at the start and at the end of the result are
        dropped.

        Encoding names are expected to be plain ASCII; a name that
        does contain non-ASCII characters must at least be
        representable in Latin-1.

    """
    # str.translate() with a 256-character table only works on 8-bit
    # strings, so Unicode names are narrowed to Latin-1 bytes first.
    # The hasattr() guard keeps the 'unicode' name from being evaluated
    # when the builtin does not exist.
    unicode_available = hasattr(__builtin__, "unicode")
    if unicode_available and isinstance(encoding, unicode):
        # .encode('latin-1') does *not* go through the codec registry,
        # so there is no recursion back into the search function here
        # (see PyUnicode_AsEncodedString() in unicodeobject.c).
        encoding = encoding.encode('latin-1')

    # The table blanks out every separator character; splitting on
    # whitespace then collapses runs and trims both ends in one step.
    words = encoding.translate(_norm_encoding_map).split()
    return '_'.join(words)
|
2002-10-04 08:45:38 -03:00
|
|
|
|
2000-03-10 19:17:24 -04:00
|
|
|
def search_function(encoding):
    """ Codec search function for the modules of the encodings package.

        encoding is the encoding name as passed in by the codec
        registry.

        Returns the codecs.CodecInfo object for that encoding, or None
        if no matching codec module could be imported from the
        package. Both outcomes are cached under the name exactly as
        given, so repeated lookups do not import anything again.

        Raises CodecRegistryError if the module's getregentry() returns
        a legacy tuple of the wrong length or with non-callable
        entries.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding name and
    # look up the module using the aliased name, then fall back to the
    # normalized name itself. Candidates are only ever imported from
    # inside the encodings package.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = (_aliases.get(norm_encoding) or
                        _aliases.get(norm_encoding.replace('.', '_')))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        # Empty names and dotted names (which would address a nested
        # package) are never imported.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break
    else:
        # No candidate could be imported
        mod = None

    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module (this also covers mod being None)
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        # Legacy interface: a plain sequence of 4 to 7 items laid out
        # in the order of the CodecInfo constructor arguments.
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError(
                'module "%s" (%s) failed to register' %
                (mod.__name__, getattr(mod, '__file__', None)))
        # Items 0 and 1 must be callable; items 2 to 5 may be None but
        # must be callable otherwise. Item 6 (the name) is not checked.
        required_ok = (hasattr(entry[0], '__call__') and
                       hasattr(entry[1], '__call__'))
        optional_ok = all(entry[i] is None or hasattr(entry[i], '__call__')
                          for i in range(2, min(len(entry), 6)))
        if not (required_ok and optional_ok):
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' %
                (mod.__name__, getattr(mod, '__file__', None)))
        if len(entry) < 7 or entry[6] is None:
            # No usable name supplied: pad the callables out to six
            # slots and use the module name (minus the leading package
            # name) as the codec name. Cutting the entry to six items
            # first keeps a 7-item entry whose name slot is None from
            # growing to eight items, which CodecInfo would reject.
            entry = (tuple(entry[:6]) + (None,) * (6 - len(entry)) +
                     (mod.__name__.split(".", 1)[1],))
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        # getaliases() is optional
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                # modname still holds the name whose import succeeded
                _aliases[alias] = modname

    # Return the registry entry
    return entry
|
|
|
|
|
|
|
|
# Register the search_function in the Python codec registry
|
|
|
|
# Module-level side effect: this call runs when the package is imported.
codecs.register(search_function)
|