cpython/Lib/encodings/__init__.py

""" Standard "encodings" Package

    Standard Python encoding modules are stored in this package
    directory.

    Codec modules must have names corresponding to normalized encoding
    names as defined in the normalize_encoding() function below, e.g.
    'utf-8' must be implemented by the module 'utf_8.py'.

    Each codec module must export the following interface:

    * getregentry() -> codecs.CodecInfo object
    The getregentry() API must return a CodecInfo object with encoder, decoder,
    incrementalencoder, incrementaldecoder, streamwriter and streamreader
    attributes which adhere to the Python Codec Interface Standard.

    In addition, a module may optionally also define the following
    APIs which are then used by the package's codec search function:

    * getaliases() -> sequence of encoding name strings to use as aliases

    Alias names returned by getaliases() must be normalized encoding
    names as defined by normalize_encoding().

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import codecs
from encodings import aliases
import __builtin__

# Cache used by search_function(): maps the encoding name exactly as it
# was passed in to the resulting registry entry, or to None when no
# codec module could be found for that name.
_cache = {}
# Sentinel default for _cache.get(); lets search_function() tell a name
# that was never looked up apart from a cached miss (None).
_unknown = '--unknown--'
# Passed as the fromlist argument to __import__() in search_function().
# A non-empty fromlist makes __import__() return the submodule itself
# instead of the top-level package.
_import_tail = ['*']
# 256-character translation table for str.translate(), used by
# normalize_encoding(): ASCII letters, digits and the dot map to
# themselves, every other character maps to a space. The position of
# each character in the table is significant; do not reformat the
# string literals.
_norm_encoding_map = ('                                              . '
                      '0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
                      ' abcdefghijklmnopqrstuvwxyz                     '
                      '                                                '
                      '                                                '
                      '                ')
# The alias dictionary of the encodings.aliases module. search_function()
# both reads it and adds aliases reported by codec modules to it.
_aliases = aliases.aliases

class CodecRegistryError(LookupError, SystemError):

    """ Error raised by search_function() for unusable codec modules.

        It is raised when the registry entry returned by a codec
        module's getregentry() has the wrong number of items or
        contains items which are not callable.

        The class derives from both LookupError and SystemError, so
        handlers for either of the two will catch it.

    """

def normalize_encoding(encoding):

    """ Return the normalized form of an encoding name.

        Every run of characters other than ASCII letters, digits and
        the dot (which is kept for Python package names) is collapsed
        into a single underscore, e.g. 'latin-1' becomes 'latin_1'.
        Runs at the start or at the end of the name are dropped
        entirely, so the result never has leading or trailing
        underscores.

        Encoding names should be ASCII only. Unicode names are
        converted to 8-bit strings using Latin-1, so any non-ASCII
        characters in them must be Latin-1 compatible.

    """
    name = encoding
    # .translate() takes a different kind of table for Unicode strings,
    # so convert those to 8-bit strings first. The hasattr() test keeps
    # this working on builds without unicode support.
    if hasattr(__builtin__, "unicode"):
        if isinstance(name, unicode):
            # .encode('latin-1') does *not* go through the codec
            # registry, so this cannot recurse into the search function
            # (see PyUnicode_AsEncodedString() in unicodeobject.c).
            name = name.encode('latin-1')
    # All characters to be replaced become spaces; split() then drops
    # empty fields, which collapses runs and strips both ends.
    words = name.translate(_norm_encoding_map).split()
    return '_'.join(words)

def search_function(encoding):

    """ Codec search function of the encodings package.

        Looks up the codec module for the given encoding name and
        returns its registry entry as a codecs.CodecInfo object, or
        None if no usable codec module was found. Both outcomes are
        cached in _cache under the name exactly as it was passed in.

        The name is first normalized using normalize_encoding(). If
        _aliases has an entry for the normalized name, the aliased
        module name is tried first and the normalized name second.
        Candidates which are empty or contain a dot are skipped.
        Modules are only imported from the encodings package.

        A codec module may return a plain sequence from getregentry()
        instead of a CodecInfo object. It must then have 4 to 7 items:
        items 0 and 1 must be callable, items 2 to 5 must be callable
        or None, and item 6, if given, is the codec name. A missing or
        None name is replaced with the module name relative to the
        encodings package.

        If the module defines getaliases(), the returned names are
        added to _aliases unless they are already registered.

        Raises CodecRegistryError if the sequence returned by
        getregentry() has the wrong length or holds non-callable items.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the normalized name itself. Modules are
    # only ever imported from the encodings package.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break
    else:
        # No candidate could be imported
        mod = None

    if mod is not None:
        try:
            getregentry = mod.getregentry
        except AttributeError:
            # Not a codec module
            mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError(
                'module "%s" (%s) failed to register' %
                (mod.__name__, mod.__file__))
        if not hasattr(entry[0], '__call__') or \
           not hasattr(entry[1], '__call__') or \
           (entry[2] is not None and not hasattr(entry[2], '__call__')) or \
           (entry[3] is not None and not hasattr(entry[3], '__call__')) or \
           (len(entry) > 4 and entry[4] is not None and not hasattr(entry[4], '__call__')) or \
           (len(entry) > 5 and entry[5] is not None and not hasattr(entry[5], '__call__')):
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' %
                (mod.__name__, mod.__file__))
        if len(entry) < 7 or entry[6] is None:
            # Keep only the six codec slots before appending the name.
            # Without this, a 7-item entry with a None name would grow
            # to 8 items and CodecInfo() would raise a TypeError.
            entry = tuple(entry[:6])
            entry += (None,)*(6-len(entry)) + (mod.__name__.split(".", 1)[1],)
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases); modname still holds the name of the imported module
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry

# Register the search_function in the Python codec registry. This is
# done as a side effect of importing the encodings package.
codecs.register(search_function)
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`""" Standard "encodings" Package`

			`Standard Python encoding modules are stored in this package`
			`directory.`

Extending the encoding name normalization to handle more non-alphanumeric characters. 2002-10-04 08:45:38 -03:00			`Codec modules must have names corresponding to normalized encoding`
			`names as defined in the normalize_encoding() function below, e.g.`
			`'utf-8' must be implemented by the module 'utf_8.py'.`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00
			`Each codec module must export the following interface:`

Patch #1436130: codecs.lookup() now returns a CodecInfo object (a subclass of tuple) that provides incremental decoders and encoders (a way to use stateful codecs without the stream API). Functions codecs.getincrementaldecoder() and codecs.getincrementalencoder() have been added. 2006-03-15 07:35:15 -04:00			`* getregentry() -> codecs.CodecInfo object`
			`The getregentry() API must return a CodecInfo object with encoder, decoder,`
			`incrementalencoder, incrementaldecoder, streamwriter and streamreader`
			`attributes which adhere to the Python Codec Interface Standard.`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00
			`In addition, a module may optionally also define the following`
			`APIs which are then used by the package's codec search function:`

			`* getaliases() -> sequence of encoding name strings to use as aliases`

Extending the encoding name normalization to handle more non-alphanumeric characters. 2002-10-04 08:45:38 -03:00			`Alias names returned by getaliases() must be normalized encoding`
			`names as defined by normalize_encoding().`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00
			`Written by Marc-Andre Lemburg (mal@lemburg.com).`

			`(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.`

			`"""#"`

importing types is not necessary if we use isinstance 2006-08-24 22:52:49 -03:00			`import codecs`
Instead of relative imports, use (implicitly) absolute ones. 2006-03-15 19:08:13 -04:00			`from encodings import aliases`
Fixed bug #1915: Python compiles with --enable-unicode=no again. However several extension methods and modules do not work without unicode support. 2008-01-23 10:20:50 -04:00			`import __builtin__`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00
			`_cache = {}`
On 17-Mar-2000, Marc-Andre Lemburg said: Attached you find an update of the Unicode implementation. The patch is against the current CVS version. I would appreciate if someone with CVS checkin permissions could check the changes in. The patch contains all bugs and patches sent this week and also fixes a leak in the codecs code and a bug in the free list code for Unicode objects (which only shows up when compiling Python with Py_DEBUG; thanks to MarkH for spotting this one). 2000-03-20 12:36:48 -04:00			`_unknown = '--unknown--'`
Add IANA character set aliases to the encodings alias dictionary and make alias lookup lazy. Note that only those IANA character set aliases were added for which we actually have codecs in the encodings package. 2002-02-10 17:36:20 -04:00			`_import_tail = ['*']`
Remove usage of re module from encodings package search function. 2003-05-16 14:07:51 -03:00			`_norm_encoding_map = (' . '`
			`'0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ '`
			`' abcdefghijklmnopqrstuvwxyz '`
			`' '`
			`' '`
			`' ')`
Let the default encodings search function lookup aliases before trying the codec import. This allows applications to install codecs which override (non-special-cased) builtin codecs. 2004-01-20 05:40:14 -04:00			`_aliases = aliases.aliases`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00
No need to import exceptions, they are builtins 2005-08-31 21:45:28 -03:00			`class CodecRegistryError(LookupError, SystemError):`
Fixed search function error reporting in the encodings package __init__.py module to raise errors which can be catched as LookupErrors as well as SystemErrors. Modified the error messages to include more information about the failing module. 2001-09-19 08:52:07 -03:00			`pass`

Extending the encoding name normalization to handle more non-alphanumeric characters. 2002-10-04 08:45:38 -03:00			`def normalize_encoding(encoding):`

			`""" Normalize an encoding name.`

			`Normalization works as follows: all non-alphanumeric`
			`characters except the dot used for Python package names are`
			`collapsed and replaced with a single underscore, e.g. ' -;#'`
Remove usage of re module from encodings package search function. 2003-05-16 14:07:51 -03:00			`becomes '_'. Leading and trailing underscores are removed.`

			`Note that encoding names should be ASCII only; if they do use`
			`non-ASCII characters, these must be Latin-1 compatible.`
Whitespace normalization. 2002-12-24 14:31:27 -04:00
Extending the encoding name normalization to handle more non-alphanumeric characters. 2002-10-04 08:45:38 -03:00			`"""`
Remove usage of re module from encodings package search function. 2003-05-16 14:07:51 -03:00			`# Make sure we have an 8-bit string, because .translate() works`
			`# differently for Unicode strings.`
Fixed bug #1915: Python compiles with --enable-unicode=no again. However several extension methods and modules do not work without unicode support. 2008-01-23 10:20:50 -04:00			`if hasattr(__builtin__, "unicode") and isinstance(encoding, unicode):`
Remove usage of re module from encodings package search function. 2003-05-16 14:07:51 -03:00			`# Note that .encode('latin-1') does not use the codec`
			`# registry, so this call doesn't recurse. (See unicodeobject.c`
			`# PyUnicode_AsEncodedString() for details)`
			`encoding = encoding.encode('latin-1')`
			`return '_'.join(encoding.translate(_norm_encoding_map).split())`
Extending the encoding name normalization to handle more non-alphanumeric characters. 2002-10-04 08:45:38 -03:00
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`def search_function(encoding):`
Whitespace normalization. 2002-08-08 17:19:19 -03:00
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`# Cache lookup`
Add IANA character set aliases to the encodings alias dictionary and make alias lookup lazy. Note that only those IANA character set aliases were added for which we actually have codecs in the encodings package. 2002-02-10 17:36:20 -04:00			`entry = _cache.get(encoding, _unknown)`
On 17-Mar-2000, Marc-Andre Lemburg said: Attached you find an update of the Unicode implementation. The patch is against the current CVS version. I would appreciate if someone with CVS checkin permissions could check the changes in. The patch contains all bugs and patches sent this week and also fixes a leak in the codecs code and a bug in the free list code for Unicode objects (which only shows up when compiling Python with Py_DEBUG; thanks to MarkH for spotting this one). 2000-03-20 12:36:48 -04:00			`if entry is not _unknown:`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`return entry`

Add IANA character set aliases to the encodings alias dictionary and make alias lookup lazy. Note that only those IANA character set aliases were added for which we actually have codecs in the encodings package. 2002-02-10 17:36:20 -04:00			`# Import the module:`
			`#`
Let the default encodings search function lookup aliases before trying the codec import. This allows applications to install codecs which override (non-special-cased) builtin codecs. 2004-01-20 05:40:14 -04:00			`# First try to find an alias for the normalized encoding`
			`# name and lookup the module using the aliased name, then try to`
			`# lookup the module using the standard import scheme, i.e. first`
			`# try in the encodings package, then at top-level.`
Add IANA character set aliases to the encodings alias dictionary and make alias lookup lazy. Note that only those IANA character set aliases were added for which we actually have codecs in the encodings package. 2002-02-10 17:36:20 -04:00			`#`
Let the default encodings search function lookup aliases before trying the codec import. This allows applications to install codecs which override (non-special-cased) builtin codecs. 2004-01-20 05:40:14 -04:00			`norm_encoding = normalize_encoding(encoding)`
			`aliased_encoding = _aliases.get(norm_encoding) or \`
			`_aliases.get(norm_encoding.replace('.', '_'))`
			`if aliased_encoding is not None:`
			`modnames = [aliased_encoding,`
			`norm_encoding]`
			`else:`
			`modnames = [norm_encoding]`
			`for modname in modnames:`
Bug #1446043: correctly raise a LookupError if an encoding name given to encodings.search_function() contains a dot. 2006-09-30 08:22:28 -03:00			`if not modname or '.' in modname:`
Let the default encodings search function lookup aliases before trying the codec import. This allows applications to install codecs which override (non-special-cased) builtin codecs. 2004-01-20 05:40:14 -04:00			`continue`
Corrected import behaviour for codecs which live outside the encodings package. 2002-02-11 13:43:46 -04:00			`try:`
Make the __import__ call in encodings.__init__ absolute with a level 0 call. 2007-02-16 15:33:01 -04:00			`# Import is absolute to prevent the possibly malicious import of a`
			`# module with side-effects that is not in the 'encodings' package.`
			`mod = __import__('encodings.' + modname, fromlist=_import_tail,`
			`level=0)`
Revert #571603 since it is ok to import codecs that are not subdirectories of encodings. Skip modules that don't have a getregentry function. 2002-07-29 11:05:24 -03:00			`except ImportError:`
Let the default encodings search function lookup aliases before trying the codec import. This allows applications to install codecs which override (non-special-cased) builtin codecs. 2004-01-20 05:40:14 -04:00			`pass`
			`else:`
			`break`
			`else:`
			`mod = None`
Revert #571603 since it is ok to import codecs that are not subdirectories of encodings. Skip modules that don't have a getregentry function. 2002-07-29 11:05:24 -03:00
			`try:`
			`getregentry = mod.getregentry`
			`except AttributeError:`
			`# Not a codec module`
			`mod = None`

Add IANA character set aliases to the encodings alias dictionary and make alias lookup lazy. Note that only those IANA character set aliases were added for which we actually have codecs in the encodings package. 2002-02-10 17:36:20 -04:00			`if mod is None:`
Corrected import behaviour for codecs which live outside the encodings package. 2002-02-11 13:43:46 -04:00			`# Cache misses`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`_cache[encoding] = None`
Whitespace normalization. 2002-08-08 17:19:19 -03:00			`return None`

Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`# Now ask the module for the registry entry`
Patch #1436130: codecs.lookup() now returns a CodecInfo object (a subclass of tuple) that provides incremental decoders and encoders (a way to use stateful codecs without the stream API). Functions codecs.getincrementaldecoder() and codecs.getincrementalencoder() have been added. 2006-03-15 07:35:15 -04:00			`entry = getregentry()`
			`if not isinstance(entry, codecs.CodecInfo):`
			`if not 4 <= len(entry) <= 7:`
Whitespace normalization. 2006-03-15 14:08:37 -04:00			`raise CodecRegistryError,\`
			`'module "%s" (%s) failed to register' % \`
			`(mod.__name__, mod.__file__)`
replace callable() 2009-10-09 19:05:45 -03:00			`if not hasattr(entry[0], '__call__') or \`
			`not hasattr(entry[1], '__call__') or \`
			`(entry[2] is not None and not hasattr(entry[2], '__call__')) or \`
			`(entry[3] is not None and not hasattr(entry[3], '__call__')) or \`
			`(len(entry) > 4 and entry[4] is not None and not hasattr(entry[4], '__call__')) or \`
			`(len(entry) > 5 and entry[5] is not None and not hasattr(entry[5], '__call__')):`
Fixed search function error reporting in the encodings package __init__.py module to raise errors which can be catched as LookupErrors as well as SystemErrors. Modified the error messages to include more information about the failing module. 2001-09-19 08:52:07 -03:00			`raise CodecRegistryError,\`
Patch #1436130: codecs.lookup() now returns a CodecInfo object (a subclass of tuple) that provides incremental decoders and encoders (a way to use stateful codecs without the stream API). Functions codecs.getincrementaldecoder() and codecs.getincrementalencoder() have been added. 2006-03-15 07:35:15 -04:00			`'incompatible codecs in module "%s" (%s)' % \`
			`(mod.__name__, mod.__file__)`
			`if len(entry)<7 or entry[6] is None:`
			`entry += (None,)*(6-len(entry)) + (mod.__name__.split(".", 1)[1],)`
			`entry = codecs.CodecInfo(*entry)`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00
Changed .getaliases() support to register the new aliases in the encodings package aliases mapping dictionary rather than in the internal cache used by the search function. This enables aliases to take advantage of the full normalization process applied to encoding names which was previously not available. The patch restricts alias registration to new aliases. Existing aliases cannot be overridden anymore. 2000-12-12 10:45:35 -04:00			`# Cache the codec registry entry`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`_cache[encoding] = entry`
Changed .getaliases() support to register the new aliases in the encodings package aliases mapping dictionary rather than in the internal cache used by the search function. This enables aliases to take advantage of the full normalization process applied to encoding names which was previously not available. The patch restricts alias registration to new aliases. Existing aliases cannot be overridden anymore. 2000-12-12 10:45:35 -04:00
			`# Register its aliases (without overwriting previously registered`
			`# aliases)`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`try:`
			`codecaliases = mod.getaliases()`
			`except AttributeError:`
			`pass`
			`else:`
			`for alias in codecaliases:`
Let the default encodings search function lookup aliases before trying the codec import. This allows applications to install codecs which override (non-special-cased) builtin codecs. 2004-01-20 05:40:14 -04:00			`if not _aliases.has_key(alias):`
			`_aliases[alias] = modname`
Changed .getaliases() support to register the new aliases in the encodings package aliases mapping dictionary rather than in the internal cache used by the search function. This enables aliases to take advantage of the full normalization process applied to encoding names which was previously not available. The patch restricts alias registration to new aliases. Existing aliases cannot be overridden anymore. 2000-12-12 10:45:35 -04:00
			`# Return the registry entry`
Marc-Andre Lemburg: Unicode encodings. 2000-03-10 19:17:24 -04:00			`return entry`

			`# Register the search_function in the Python codec registry`
			`codecs.register(search_function)`