Close #7475: Restore binary & text transform codecs

The codecs themselves were restored in Python 3.2; this
commit completes the restoration by adding back the
convenience aliases.

These aliases were originally left out due to confusing
errors when attempting to use them with the text encoding
specific convenience methods. Python 3.4 includes several
improvements to those errors, thus permitting the aliases
to be restored as well.
This commit is contained in:
Nick Coghlan 2013-11-23 11:13:36 +10:00
parent 12820c0d5d
commit 9c1aed8f94
4 changed files with 139 additions and 77 deletions

View File

@ -1188,6 +1188,9 @@ common use case for codecs, the underlying codec infrastructure supports
arbitrary data transforms rather than just text encodings). For asymmetric
codecs, the stated purpose describes the encoding direction.
Text Encodings
^^^^^^^^^^^^^^
The following codecs provide :class:`str` to :class:`bytes` encoding and
:term:`bytes-like object` to :class:`str` decoding, similar to the Unicode text
encodings.
@ -1234,62 +1237,83 @@ encodings.
| | | .. deprecated:: 3.3 |
+--------------------+---------+---------------------------+
The following codecs provide :term:`bytes-like object` to :class:`bytes`
mappings.
.. _binary-transforms:
Binary Transforms
^^^^^^^^^^^^^^^^^
The following codecs provide binary transforms: :term:`bytes-like object`
to :class:`bytes` mappings.
.. tabularcolumns:: |l|L|L|
.. tabularcolumns:: |l|L|L|L|
+----------------------+------------------------------+------------------------------+
| Codec | Purpose | Encoder / decoder |
+======================+==============================+==============================+
| base64_codec [#b64]_ | Convert operand to MIME | :meth:`base64.b64encode` / |
| | base64 (the result always | :meth:`base64.b64decode` |
| | includes a trailing | |
| | ``'\n'``) | |
| | | |
| | .. versionchanged:: 3.4 | |
| | accepts any | |
| | :term:`bytes-like object` | |
| | as input for encoding and | |
| | decoding | |
+----------------------+------------------------------+------------------------------+
| bz2_codec | Compress the operand | :meth:`bz2.compress` / |
| | using bz2 | :meth:`bz2.decompress` |
+----------------------+------------------------------+------------------------------+
| hex_codec | Convert operand to | :meth:`base64.b16encode` / |
| | hexadecimal | :meth:`base64.b16decode` |
| | representation, with two | |
| | digits per byte | |
+----------------------+------------------------------+------------------------------+
| quopri_codec | Convert operand to MIME | :meth:`quopri.encodestring` /|
| | quoted printable | :meth:`quopri.decodestring` |
+----------------------+------------------------------+------------------------------+
| uu_codec | Convert the operand using | :meth:`uu.encode` / |
| | uuencode | :meth:`uu.decode` |
+----------------------+------------------------------+------------------------------+
| zlib_codec | Compress the operand | :meth:`zlib.compress` / |
| | using gzip | :meth:`zlib.decompress` |
+----------------------+------------------------------+------------------------------+
+----------------------+------------------+------------------------------+------------------------------+
| Codec | Aliases | Purpose | Encoder / decoder |
+======================+==================+==============================+==============================+
| base64_codec [#b64]_ | base64, base_64 | Convert operand to MIME | :meth:`base64.b64encode` / |
| | | base64 (the result always | :meth:`base64.b64decode` |
| | | includes a trailing | |
| | | ``'\n'``) | |
| | | | |
| | | .. versionchanged:: 3.4 | |
| | | accepts any | |
| | | :term:`bytes-like object` | |
| | | as input for encoding and | |
| | | decoding | |
+----------------------+------------------+------------------------------+------------------------------+
| bz2_codec | bz2 | Compress the operand | :meth:`bz2.compress` / |
| | | using bz2 | :meth:`bz2.decompress` |
+----------------------+------------------+------------------------------+------------------------------+
| hex_codec | hex | Convert operand to | :meth:`base64.b16encode` / |
| | | hexadecimal | :meth:`base64.b16decode` |
| | | representation, with two | |
| | | digits per byte | |
+----------------------+------------------+------------------------------+------------------------------+
| quopri_codec | quopri, | Convert operand to MIME | :meth:`quopri.encodestring` /|
| | quotedprintable, | quoted printable | :meth:`quopri.decodestring` |
| | quoted_printable | | |
+----------------------+------------------+------------------------------+------------------------------+
| uu_codec | uu | Convert the operand using | :meth:`uu.encode` / |
| | | uuencode | :meth:`uu.decode` |
+----------------------+------------------+------------------------------+------------------------------+
| zlib_codec | zip, zlib | Compress the operand | :meth:`zlib.compress` / |
| | | using gzip | :meth:`zlib.decompress` |
+----------------------+------------------+------------------------------+------------------------------+
.. [#b64] In addition to :term:`bytes-like objects <bytes-like object>`,
``'base64_codec'`` also accepts ASCII-only instances of :class:`str` for
decoding
.. versionadded:: 3.2
Restoration of the binary transforms.
The following codecs provide :class:`str` to :class:`str` mappings.
.. versionchanged:: 3.4
Restoration of the aliases for the binary transforms.
.. tabularcolumns:: |l|L|
+--------------------+---------------------------+
| Codec | Purpose |
+====================+===========================+
| rot_13 | Returns the Caesar-cypher |
| | encryption of the operand |
+--------------------+---------------------------+
.. _text-transforms:
Text Transforms
^^^^^^^^^^^^^^^
The following codec provides a text transform: a :class:`str` to :class:`str`
mapping.
.. tabularcolumns:: |l|l|L|
+--------------------+---------+---------------------------+
| Codec | Aliases | Purpose |
+====================+=========+===========================+
| rot_13 | rot13 | Returns the Caesar-cypher |
| | | encryption of the operand |
+--------------------+---------+---------------------------+
.. versionadded:: 3.2
bytes-to-bytes and str-to-str codecs.
Restoration of the ``rot_13`` text transform.
.. versionchanged:: 3.4
Restoration of the ``rot13`` alias.
:mod:`encodings.idna` --- Internationalized Domain Names in Applications

View File

@ -103,7 +103,8 @@ New expected features for Python implementations:
* :ref:`PEP 446: Make newly created file descriptors non-inheritable <pep-446>`.
* command line option for :ref:`isolated mode <using-on-misc-options>`,
(:issue:`16499`).
* improvements to handling of non-Unicode codecs
* :ref:`improvements <codec-handling-improvements>` in the handling of
codecs that are not text encodings
Significantly Improved Library Modules:
@ -173,8 +174,10 @@ PEP 446: Make newly created file descriptors non-inheritable
PEP written and implemented by Victor Stinner.
Improvements to handling of non-Unicode codecs
==============================================
.. _codec-handling-improvements:
Improvements to codec handling
==============================
Since it was first introduced, the :mod:`codecs` module has always been
intended to operate as a type-neutral dynamic encoding and decoding
@ -186,7 +189,7 @@ fact.
As a key step in clarifying the situation, the :meth:`codecs.encode` and
:meth:`codecs.decode` convenience functions are now properly documented in
Python 2.7, 3.3 and 3.4. These functions have existed in the :mod:`codecs`
module and have been covered by the regression test suite since Python 2.4,
module (and have been covered by the regression test suite) since Python 2.4,
but were previously only discoverable through runtime introspection.
Unlike the convenience methods on :class:`str`, :class:`bytes` and
@ -199,43 +202,58 @@ In Python 3.4, the interpreter is able to identify the known non-text
encodings provided in the standard library and direct users towards these
general purpose convenience functions when appropriate::
>>> import codecs
>>> b"abcdef".decode("hex_codec")
>>> b"abcdef".decode("hex")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
LookupError: 'hex_codec' is not a text encoding; use codecs.decode() to handle arbitrary codecs
LookupError: 'hex' is not a text encoding; use codecs.decode() to handle arbitrary codecs
>>> "hello".encode("rot_13")
>>> "hello".encode("rot13")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
LookupError: 'rot_13' is not a text encoding; use codecs.encode() to handle arbitrary codecs
LookupError: 'rot13' is not a text encoding; use codecs.encode() to handle arbitrary codecs
In a related change, whenever it is feasible without breaking backwards
compatibility, exceptions raised during encoding and decoding operations
will be wrapped in a chained exception of the same type that mentions the
name of the codec responsible for producing the error::
>>> codecs.decode(b"abcdefgh", "hex_codec")
>>> import codecs
>>> codecs.decode(b"abcdefgh", "hex")
binascii.Error: Non-hexadecimal digit found
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
binascii.Error: decoding with 'hex_codec' codec failed (Error: Non-hexadecimal digit found)
binascii.Error: decoding with 'hex' codec failed (Error: Non-hexadecimal digit found)
>>> codecs.encode("hello", "bz2_codec")
>>> codecs.encode("hello", "bz2")
TypeError: 'str' does not support the buffer interface
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: encoding with 'bz2_codec' codec failed (TypeError: 'str' does not support the buffer interface)
TypeError: encoding with 'bz2' codec failed (TypeError: 'str' does not support the buffer interface)
(Contributed by Nick Coghlan in :issue:`17827`, :issue:`17828` and
:issue:`19619`)
Finally, as the examples above show, these improvements have permitted
the restoration of the convenience aliases for the non-Unicode codecs that
were themselves restored in Python 3.2. This means that converting binary data
to and from its hexadecimal representation (for example) can now be written
as::
>>> from codecs import encode, decode
>>> encode(b"hello", "hex")
b'68656c6c6f'
>>> decode(b"68656c6c6f", "hex")
b'hello'
The binary and text transforms provided in the standard library are detailed
in :ref:`binary-transforms` and :ref:`text-transforms`.
(Contributed by Nick Coghlan in :issue:`7475`, :issue:`17827`,
:issue:`17828` and :issue:`19619`)
.. _pep-451:

View File

@ -33,9 +33,9 @@ aliases = {
'us' : 'ascii',
'us_ascii' : 'ascii',
## base64_codec codec
#'base64' : 'base64_codec',
#'base_64' : 'base64_codec',
# base64_codec codec
'base64' : 'base64_codec',
'base_64' : 'base64_codec',
# big5 codec
'big5_tw' : 'big5',
@ -45,8 +45,8 @@ aliases = {
'big5_hkscs' : 'big5hkscs',
'hkscs' : 'big5hkscs',
## bz2_codec codec
#'bz2' : 'bz2_codec',
# bz2_codec codec
'bz2' : 'bz2_codec',
# cp037 codec
'037' : 'cp037',
@ -248,8 +248,8 @@ aliases = {
'cp936' : 'gbk',
'ms936' : 'gbk',
## hex_codec codec
#'hex' : 'hex_codec',
# hex_codec codec
'hex' : 'hex_codec',
# hp_roman8 codec
'roman8' : 'hp_roman8',
@ -450,13 +450,13 @@ aliases = {
'cp154' : 'ptcp154',
'cyrillic_asian' : 'ptcp154',
## quopri_codec codec
#'quopri' : 'quopri_codec',
#'quoted_printable' : 'quopri_codec',
#'quotedprintable' : 'quopri_codec',
# quopri_codec codec
'quopri' : 'quopri_codec',
'quoted_printable' : 'quopri_codec',
'quotedprintable' : 'quopri_codec',
## rot_13 codec
#'rot13' : 'rot_13',
# rot_13 codec
'rot13' : 'rot_13',
# shift_jis codec
'csshiftjis' : 'shift_jis',
@ -518,12 +518,12 @@ aliases = {
'utf8_ucs2' : 'utf_8',
'utf8_ucs4' : 'utf_8',
## uu_codec codec
#'uu' : 'uu_codec',
# uu_codec codec
'uu' : 'uu_codec',
## zlib_codec codec
#'zip' : 'zlib_codec',
#'zlib' : 'zlib_codec',
# zlib_codec codec
'zip' : 'zlib_codec',
'zlib' : 'zlib_codec',
# temporary mac CJK aliases, will be replaced by proper codecs in 3.1
'x_mac_japanese' : 'shift_jis',

View File

@ -2320,18 +2320,29 @@ bytes_transform_encodings = [
"quopri_codec",
"hex_codec",
]
transform_aliases = {
"base64_codec": ["base64", "base_64"],
"uu_codec": ["uu"],
"quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
"hex_codec": ["hex"],
"rot_13": ["rot13"],
}
try:
import zlib
except ImportError:
pass
else:
bytes_transform_encodings.append("zlib_codec")
transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
import bz2
except ImportError:
pass
else:
bytes_transform_encodings.append("bz2_codec")
transform_aliases["bz2_codec"] = ["bz2"]
class TransformCodecTest(unittest.TestCase):
@ -2445,6 +2456,15 @@ class TransformCodecTest(unittest.TestCase):
# Unfortunately, the bz2 module throws OSError, which the codec
# machinery currently can't wrap :(
# Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
for codec_name, aliases in transform_aliases.items():
expected_name = codecs.lookup(codec_name).name
for alias in aliases:
with self.subTest(alias=alias):
info = codecs.lookup(alias)
self.assertEqual(info.name, expected_name)
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We