From 02524629f39bb70f4ea00ab8e64d694e08719227 Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Thu, 2 Dec 2010 18:06:51 +0000 Subject: [PATCH] #7475: add (un)transform method to bytes/bytearray and str, add back codecs that can be used with them from Python 2. --- Doc/library/codecs.rst | 40 ++++++++++++ Doc/library/stdtypes.rst | 44 +++++++++++++ Lib/codecs.py | 23 ++++--- Lib/encodings/aliases.py | 36 +++++------ Lib/encodings/base64_codec.py | 55 +++++++++++++++++ Lib/encodings/bz2_codec.py | 77 +++++++++++++++++++++++ Lib/encodings/hex_codec.py | 55 +++++++++++++++++ Lib/encodings/quopri_codec.py | 56 +++++++++++++++++ Lib/encodings/rot_13.py | 113 ++++++++++++++++++++++++++++++++++ Lib/encodings/uu_codec.py | 99 +++++++++++++++++++++++++++++ Lib/encodings/zlib_codec.py | 77 +++++++++++++++++++++++ Lib/test/test_bytes.py | 5 ++ Lib/test/test_codecs.py | 62 +++++++++++++++++++ Misc/NEWS | 4 ++ Objects/bytearrayobject.c | 73 ++++++++++++++++++++++ Objects/bytesobject.c | 64 +++++++++++++++++++ Objects/unicodeobject.c | 46 +++++++++++++- 17 files changed, 900 insertions(+), 29 deletions(-) create mode 100644 Lib/encodings/base64_codec.py create mode 100644 Lib/encodings/bz2_codec.py create mode 100644 Lib/encodings/hex_codec.py create mode 100644 Lib/encodings/quopri_codec.py create mode 100755 Lib/encodings/rot_13.py create mode 100644 Lib/encodings/uu_codec.py create mode 100644 Lib/encodings/zlib_codec.py diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index dcfc46f0473..5416d3b2593 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1165,6 +1165,46 @@ particular, the following variants typically exist: | | | operand | +--------------------+---------+---------------------------+ +The following codecs provide bytes-to-bytes mappings. They can be used with +:meth:`bytes.transform` and :meth:`bytes.untransform`. + ++--------------------+---------------------------+---------------------------+ +| Codec | Aliases | Purpose | ++====================+===========================+===========================+ +| base64_codec | base64, base-64 | Convert operand to MIME | +| | | base64 | ++--------------------+---------------------------+---------------------------+ +| bz2_codec | bz2 | Compress the operand | +| | | using bz2 | ++--------------------+---------------------------+---------------------------+ +| hex_codec | hex | Convert operand to | +| | | hexadecimal | +| | | representation, with two | +| | | digits per byte | ++--------------------+---------------------------+---------------------------+ +| quopri_codec | quopri, quoted-printable, | Convert operand to MIME | +| | quotedprintable | quoted printable | ++--------------------+---------------------------+---------------------------+ +| uu_codec | uu | Convert the operand using | +| | | uuencode | ++--------------------+---------------------------+---------------------------+ +| zlib_codec | zip, zlib | Compress the operand | +| | | using gzip | ++--------------------+---------------------------+---------------------------+ + +The following codecs provide string-to-string mappings. They can be used with +:meth:`str.transform` and :meth:`str.untransform`. + ++--------------------+---------------------------+---------------------------+ +| Codec | Aliases | Purpose | ++====================+===========================+===========================+ +| rot_13 | rot13 | Returns the Caesar-cypher | +| | | encryption of the operand | ++--------------------+---------------------------+---------------------------+ + +.. versionadded:: 3.2 + bytes-to-bytes and string-to-string codecs. + :mod:`encodings.idna` --- Internationalized Domain Names in Applications ------------------------------------------------------------------------ diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index b2931ae48a7..6ebf2a03fa7 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1352,6 +1352,19 @@ functions based on regular expressions. "They're Bill's Friends." +.. method:: str.transform(encoding, errors='strict') + + Return an encoded version of the string. In contrast to :meth:`encode`, this + method works with codecs that provide string-to-string mappings, and not + string-to-bytes mappings. :meth:`transform` therefore returns a string + object. + + The codecs that can be used with this method are listed in + :ref:`standard-encodings`. + + .. versionadded:: 3.2 + + .. method:: str.translate(map) Return a copy of the *s* where all characters have been mapped through the @@ -1369,6 +1382,14 @@ functions based on regular expressions. example). +.. method:: str.untransform(encoding, errors='strict') + + Return a decoded version of the string. This provides the reverse operation + of :meth:`transform`. + + .. versionadded:: 3.2 + + .. method:: str.upper() Return a copy of the string converted to uppercase. @@ -1800,6 +1821,20 @@ The bytes and bytearray types have an additional class method: The maketrans and translate methods differ in semantics from the versions available on strings: +.. method:: bytes.transform(encoding, errors='strict') + bytearray.transform(encoding, errors='strict') + + Return an encoded version of the bytes object. In contrast to + :meth:`encode`, this method works with codecs that provide bytes-to-bytes + mappings, and not string-to-bytes mappings. :meth:`transform` therefore + returns a bytes or bytearray object. + + The codecs that can be used with this method are listed in + :ref:`standard-encodings`. + + .. versionadded:: 3.2 + + .. method:: bytes.translate(table[, delete]) bytearray.translate(table[, delete]) @@ -1817,6 +1852,15 @@ available on strings: b'rd ths shrt txt' +.. method:: bytes.untransform(encoding, errors='strict') + bytearray.untransform(encoding, errors='strict') + + Return an decoded version of the bytes object. This provides the reverse + operation of :meth:`transform`. + + .. versionadded:: 3.2 + + .. staticmethod:: bytes.maketrans(from, to) bytearray.maketrans(from, to) diff --git a/Lib/codecs.py b/Lib/codecs.py index f6c24481687..b150d64d53c 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -396,6 +396,8 @@ class StreamWriter(Codec): class StreamReader(Codec): + charbuffertype = str + def __init__(self, stream, errors='strict'): """ Creates a StreamReader instance. @@ -417,9 +419,8 @@ class StreamReader(Codec): self.stream = stream self.errors = errors self.bytebuffer = b"" - # For str->str decoding this will stay a str - # For str->unicode decoding the first read will promote it to unicode - self.charbuffer = "" + self._empty_charbuffer = self.charbuffertype() + self.charbuffer = self._empty_charbuffer self.linebuffer = None def decode(self, input, errors='strict'): @@ -455,7 +456,7 @@ class StreamReader(Codec): """ # If we have lines cached, first merge them back into characters if self.linebuffer: - self.charbuffer = "".join(self.linebuffer) + self.charbuffer = self._empty_charbuffer.join(self.linebuffer) self.linebuffer = None # read until we get the required number of characters (if available) @@ -498,7 +499,7 @@ class StreamReader(Codec): if chars < 0: # Return everything we've got result = self.charbuffer - self.charbuffer = "" + self.charbuffer = self._empty_charbuffer else: # Return the first chars characters result = self.charbuffer[:chars] @@ -529,7 +530,7 @@ class StreamReader(Codec): return line readsize = size or 72 - line = "" + line = self._empty_charbuffer # If size is given, we call read() only once while True: data = self.read(readsize, firstline=True) @@ -537,7 +538,8 @@ class StreamReader(Codec): # If we're at a "\r" read one extra character (which might # be a "\n") to get a proper line ending. If the stream is # temporarily exhausted we return the wrong line ending. - if data.endswith("\r"): + if (isinstance(data, str) and data.endswith("\r")) or \ + (isinstance(data, bytes) and data.endswith(b"\r")): data += self.read(size=1, chars=1) line += data @@ -563,7 +565,8 @@ class StreamReader(Codec): line0withoutend = lines[0].splitlines(False)[0] if line0withend != line0withoutend: # We really have a line end # Put the rest back together and keep it until the next call - self.charbuffer = "".join(lines[1:]) + self.charbuffer + self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \ + self.charbuffer if keepends: line = line0withend else: @@ -574,7 +577,7 @@ class StreamReader(Codec): if line and not keepends: line = line.splitlines(False)[0] break - if readsize<8000: + if readsize < 8000: readsize *= 2 return line @@ -603,7 +606,7 @@ class StreamReader(Codec): """ self.bytebuffer = b"" - self.charbuffer = "" + self.charbuffer = self._empty_charbuffer self.linebuffer = None def seek(self, offset, whence=0): diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index 235deb5c309..331095b1f16 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -33,9 +33,9 @@ aliases = { 'us' : 'ascii', 'us_ascii' : 'ascii', - ## base64_codec codec - #'base64' : 'base64_codec', - #'base_64' : 'base64_codec', + # base64_codec codec + 'base64' : 'base64_codec', + 'base_64' : 'base64_codec', # big5 codec 'big5_tw' : 'big5', @@ -45,8 +45,8 @@ aliases = { 'big5_hkscs' : 'big5hkscs', 'hkscs' : 'big5hkscs', - ## bz2_codec codec - #'bz2' : 'bz2_codec', + # bz2_codec codec + 'bz2' : 'bz2_codec', # cp037 codec '037' : 'cp037', @@ -248,8 +248,8 @@ aliases = { 'cp936' : 'gbk', 'ms936' : 'gbk', - ## hex_codec codec - #'hex' : 'hex_codec', + # hex_codec codec + 'hex' : 'hex_codec', # hp_roman8 codec 'roman8' : 'hp_roman8', @@ -450,13 +450,13 @@ aliases = { 'cp154' : 'ptcp154', 'cyrillic_asian' : 'ptcp154', - ## quopri_codec codec - #'quopri' : 'quopri_codec', - #'quoted_printable' : 'quopri_codec', - #'quotedprintable' : 'quopri_codec', + # quopri_codec codec + 'quopri' : 'quopri_codec', + 'quoted_printable' : 'quopri_codec', + 'quotedprintable' : 'quopri_codec', - ## rot_13 codec - #'rot13' : 'rot_13', + # rot_13 codec + 'rot13' : 'rot_13', # shift_jis codec 'csshiftjis' : 'shift_jis', @@ -518,12 +518,12 @@ aliases = { 'utf8_ucs2' : 'utf_8', 'utf8_ucs4' : 'utf_8', - ## uu_codec codec - #'uu' : 'uu_codec', + # uu_codec codec + 'uu' : 'uu_codec', - ## zlib_codec codec - #'zip' : 'zlib_codec', - #'zlib' : 'zlib_codec', + # zlib_codec codec + 'zip' : 'zlib_codec', + 'zlib' : 'zlib_codec', # temporary mac CJK aliases, will be replaced by proper codecs in 3.1 'x_mac_japanese' : 'shift_jis', diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py new file mode 100644 index 00000000000..e8b19ee237d --- /dev/null +++ b/Lib/encodings/base64_codec.py @@ -0,0 +1,55 @@ +"""Python 'base64_codec' Codec - base64 content transfer encoding. + +This codec de/encodes from bytes to bytes and is therefore usable with +bytes.transform() and bytes.untransform(). + +Written by Marc-Andre Lemburg (mal@lemburg.com). +""" + +import codecs +import base64 + +### Codec APIs + +def base64_encode(input, errors='strict'): + assert errors == 'strict' + return (base64.encodestring(input), len(input)) + +def base64_decode(input, errors='strict'): + assert errors == 'strict' + return (base64.decodestring(input), len(input)) + +class Codec(codecs.Codec): + def encode(self, input, errors='strict'): + return base64_encode(input, errors) + def decode(self, input, errors='strict'): + return base64_decode(input, errors) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + assert self.errors == 'strict' + return base64.encodestring(input) + +class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + assert self.errors == 'strict' + return base64.decodestring(input) + +class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + +class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='base64', + encode=base64_encode, + decode=base64_decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + ) diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py new file mode 100644 index 00000000000..e65d226bfdf --- /dev/null +++ b/Lib/encodings/bz2_codec.py @@ -0,0 +1,77 @@ +"""Python 'bz2_codec' Codec - bz2 compression encoding. + +This codec de/encodes from bytes to bytes and is therefore usable with +bytes.transform() and bytes.untransform(). + +Adapted by Raymond Hettinger from zlib_codec.py which was written +by Marc-Andre Lemburg (mal@lemburg.com). +""" + +import codecs +import bz2 # this codec needs the optional bz2 module ! + +### Codec APIs + +def bz2_encode(input, errors='strict'): + assert errors == 'strict' + return (bz2.compress(input), len(input)) + +def bz2_decode(input, errors='strict'): + assert errors == 'strict' + return (bz2.decompress(input), len(input)) + +class Codec(codecs.Codec): + def encode(self, input, errors='strict'): + return bz2_encode(input, errors) + def decode(self, input, errors='strict'): + return bz2_decode(input, errors) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def __init__(self, errors='strict'): + assert errors == 'strict' + self.errors = errors + self.compressobj = bz2.BZ2Compressor() + + def encode(self, input, final=False): + if final: + c = self.compressobj.compress(input) + return c + self.compressobj.flush() + else: + return self.compressobj.compress(input) + + def reset(self): + self.compressobj = bz2.BZ2Compressor() + +class IncrementalDecoder(codecs.IncrementalDecoder): + def __init__(self, errors='strict'): + assert errors == 'strict' + self.errors = errors + self.decompressobj = bz2.BZ2Decompressor() + + def decode(self, input, final=False): + try: + return self.decompressobj.decompress(input) + except EOFError: + return '' + + def reset(self): + self.decompressobj = bz2.BZ2Decompressor() + +class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + +class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name="bz2", + encode=bz2_encode, + decode=bz2_decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + ) diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py new file mode 100644 index 00000000000..e003fc3ff97 --- /dev/null +++ b/Lib/encodings/hex_codec.py @@ -0,0 +1,55 @@ +"""Python 'hex_codec' Codec - 2-digit hex content transfer encoding. + +This codec de/encodes from bytes to bytes and is therefore usable with +bytes.transform() and bytes.untransform(). + +Written by Marc-Andre Lemburg (mal@lemburg.com). +""" + +import codecs +import binascii + +### Codec APIs + +def hex_encode(input, errors='strict'): + assert errors == 'strict' + return (binascii.b2a_hex(input), len(input)) + +def hex_decode(input, errors='strict'): + assert errors == 'strict' + return (binascii.a2b_hex(input), len(input)) + +class Codec(codecs.Codec): + def encode(self, input, errors='strict'): + return hex_encode(input, errors) + def decode(self, input, errors='strict'): + return hex_decode(input, errors) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + assert self.errors == 'strict' + return binascii.b2a_hex(input) + +class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + assert self.errors == 'strict' + return binascii.a2b_hex(input) + +class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + +class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='hex', + encode=hex_encode, + decode=hex_decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + ) diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py new file mode 100644 index 00000000000..9243fc443b0 --- /dev/null +++ b/Lib/encodings/quopri_codec.py @@ -0,0 +1,56 @@ +"""Codec for quoted-printable encoding. + +This codec de/encodes from bytes to bytes and is therefore usable with +bytes.transform() and bytes.untransform(). +""" + +import codecs +import quopri +from io import BytesIO + +def quopri_encode(input, errors='strict'): + assert errors == 'strict' + f = BytesIO(input) + g = BytesIO() + quopri.encode(f, g, 1) + return (g.getvalue(), len(input)) + +def quopri_decode(input, errors='strict'): + assert errors == 'strict' + f = BytesIO(input) + g = BytesIO() + quopri.decode(f, g) + return (g.getvalue(), len(input)) + +class Codec(codecs.Codec): + def encode(self, input, errors='strict'): + return quopri_encode(input, errors) + def decode(self, input, errors='strict'): + return quopri_decode(input, errors) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return quopri_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return quopri_decode(input, self.errors)[0] + +class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + +class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + +# encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='quopri', + encode=quopri_encode, + decode=quopri_decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + ) diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py new file mode 100755 index 00000000000..3140c1432dc --- /dev/null +++ b/Lib/encodings/rot_13.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python +""" Python Character Mapping Codec for ROT13. + +This codec de/encodes from str to str and is therefore usable with +str.transform() and str.untransform(). + +Written by Marc-Andre Lemburg (mal@lemburg.com). +""" + +import codecs + +### Codec APIs + +class Codec(codecs.Codec): + def encode(self, input, errors='strict'): + return (input.translate(rot13_map), len(input)) + + def decode(self, input, errors='strict'): + return (input.translate(rot13_map), len(input)) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return input.translate(rot13_map) + +class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return input.translate(rot13_map) + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='rot-13', + encode=Codec().encode, + decode=Codec().decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + ) + +### Map + +rot13_map = codecs.make_identity_dict(range(256)) +rot13_map.update({ + 0x0041: 0x004e, + 0x0042: 0x004f, + 0x0043: 0x0050, + 0x0044: 0x0051, + 0x0045: 0x0052, + 0x0046: 0x0053, + 0x0047: 0x0054, + 0x0048: 0x0055, + 0x0049: 0x0056, + 0x004a: 0x0057, + 0x004b: 0x0058, + 0x004c: 0x0059, + 0x004d: 0x005a, + 0x004e: 0x0041, + 0x004f: 0x0042, + 0x0050: 0x0043, + 0x0051: 0x0044, + 0x0052: 0x0045, + 0x0053: 0x0046, + 0x0054: 0x0047, + 0x0055: 0x0048, + 0x0056: 0x0049, + 0x0057: 0x004a, + 0x0058: 0x004b, + 0x0059: 0x004c, + 0x005a: 0x004d, + 0x0061: 0x006e, + 0x0062: 0x006f, + 0x0063: 0x0070, + 0x0064: 0x0071, + 0x0065: 0x0072, + 0x0066: 0x0073, + 0x0067: 0x0074, + 0x0068: 0x0075, + 0x0069: 0x0076, + 0x006a: 0x0077, + 0x006b: 0x0078, + 0x006c: 0x0079, + 0x006d: 0x007a, + 0x006e: 0x0061, + 0x006f: 0x0062, + 0x0070: 0x0063, + 0x0071: 0x0064, + 0x0072: 0x0065, + 0x0073: 0x0066, + 0x0074: 0x0067, + 0x0075: 0x0068, + 0x0076: 0x0069, + 0x0077: 0x006a, + 0x0078: 0x006b, + 0x0079: 0x006c, + 0x007a: 0x006d, +}) + +### Filter API + +def rot13(infile, outfile): + outfile.write(infile.read().encode('rot-13')) + +if __name__ == '__main__': + import sys + rot13(sys.stdin, sys.stdout) diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py new file mode 100644 index 00000000000..69c6f17c7f3 --- /dev/null +++ b/Lib/encodings/uu_codec.py @@ -0,0 +1,99 @@ +"""Python 'uu_codec' Codec - UU content transfer encoding. + +This codec de/encodes from bytes to bytes and is therefore usable with +bytes.transform() and bytes.untransform(). + +Written by Marc-Andre Lemburg (mal@lemburg.com). Some details were +adapted from uu.py which was written by Lance Ellinghouse and +modified by Jack Jansen and Fredrik Lundh. +""" + +import codecs +import binascii +from io import BytesIO + +### Codec APIs + +def uu_encode(input, errors='strict', filename='', mode=0o666): + assert errors == 'strict' + infile = BytesIO(input) + outfile = BytesIO() + read = infile.read + write = outfile.write + + # Encode + write(('begin %o %s\n' % (mode & 0o777, filename)).encode('ascii')) + chunk = read(45) + while chunk: + write(binascii.b2a_uu(chunk)) + chunk = read(45) + write(b' \nend\n') + + return (outfile.getvalue(), len(input)) + +def uu_decode(input, errors='strict'): + assert errors == 'strict' + infile = BytesIO(input) + outfile = BytesIO() + readline = infile.readline + write = outfile.write + + # Find start of encoded data + while 1: + s = readline() + if not s: + raise ValueError('Missing "begin" line in input data') + if s[:5] == b'begin': + break + + # Decode + while True: + s = readline() + if not s or s == b'end\n': + break + try: + data = binascii.a2b_uu(s) + except binascii.Error as v: + # Workaround for broken uuencoders by /Fredrik Lundh + nbytes = (((ord(s[0])-32) & 63) * 4 + 5) / 3 + data = binascii.a2b_uu(s[:nbytes]) + #sys.stderr.write("Warning: %s\n" % str(v)) + write(data) + if not s: + raise ValueError('Truncated input data') + + return (outfile.getvalue(), len(input)) + +class Codec(codecs.Codec): + def encode(self, input, errors='strict'): + return uu_encode(input, errors) + + def decode(self, input, errors='strict'): + return uu_decode(input, errors) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return uu_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return uu_decode(input, self.errors)[0] + +class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + +class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='uu', + encode=uu_encode, + decode=uu_decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py new file mode 100644 index 00000000000..e0b9cdadbcd --- /dev/null +++ b/Lib/encodings/zlib_codec.py @@ -0,0 +1,77 @@ +"""Python 'zlib_codec' Codec - zlib compression encoding. + +This codec de/encodes from bytes to bytes and is therefore usable with +bytes.transform() and bytes.untransform(). + +Written by Marc-Andre Lemburg (mal@lemburg.com). +""" + +import codecs +import zlib # this codec needs the optional zlib module ! + +### Codec APIs + +def zlib_encode(input, errors='strict'): + assert errors == 'strict' + return (zlib.compress(input), len(input)) + +def zlib_decode(input, errors='strict'): + assert errors == 'strict' + return (zlib.decompress(input), len(input)) + +class Codec(codecs.Codec): + def encode(self, input, errors='strict'): + return zlib_encode(input, errors) + def decode(self, input, errors='strict'): + return zlib_decode(input, errors) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def __init__(self, errors='strict'): + assert errors == 'strict' + self.errors = errors + self.compressobj = zlib.compressobj() + + def encode(self, input, final=False): + if final: + c = self.compressobj.compress(input) + return c + self.compressobj.flush() + else: + return self.compressobj.compress(input) + + def reset(self): + self.compressobj = zlib.compressobj() + +class IncrementalDecoder(codecs.IncrementalDecoder): + def __init__(self, errors='strict'): + assert errors == 'strict' + self.errors = errors + self.decompressobj = zlib.decompressobj() + + def decode(self, input, final=False): + if final: + c = self.decompressobj.decompress(input) + return c + self.decompressobj.flush() + else: + return self.decompressobj.decompress(input) + + def reset(self): + self.decompressobj = zlib.decompressobj() + +class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + +class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='zlib', + encode=zlib_encode, + decode=zlib_decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 24ee487019c..49b50f250d7 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -207,6 +207,11 @@ class BaseBytesTest(unittest.TestCase): self.assertEqual(b.decode(errors="ignore", encoding="utf8"), "Hello world\n") + def test_transform(self): + b1 = self.type2test(range(256)) + b2 = b1.transform("base64").untransform("base64") + self.assertEqual(b2, b1) + def test_from_int(self): b = self.type2test(0) self.assertEqual(b, self.type2test()) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index f989a552d77..bc29e06c4f5 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1659,6 +1659,67 @@ class BomTest(unittest.TestCase): self.assertEqual(f.read(), data * 2) +bytes_transform_encodings = [ + "base64_codec", + "uu_codec", + "quopri_codec", + "hex_codec", +] +try: + import zlib +except ImportError: + pass +else: + bytes_transform_encodings.append("zlib_codec") +try: + import bz2 +except ImportError: + pass +else: + bytes_transform_encodings.append("bz2_codec") + +class TransformCodecTest(unittest.TestCase): + def test_basics(self): + binput = bytes(range(256)) + ainput = bytearray(binput) + for encoding in bytes_transform_encodings: + # generic codecs interface + (o, size) = codecs.getencoder(encoding)(binput) + self.assertEqual(size, len(binput)) + (i, size) = codecs.getdecoder(encoding)(o) + self.assertEqual(size, len(o)) + self.assertEqual(i, binput) + + # transform interface + boutput = binput.transform(encoding) + aoutput = ainput.transform(encoding) + self.assertEqual(boutput, aoutput) + self.assertIsInstance(boutput, bytes) + self.assertIsInstance(aoutput, bytearray) + bback = boutput.untransform(encoding) + aback = aoutput.untransform(encoding) + self.assertEqual(bback, aback) + self.assertEqual(bback, binput) + self.assertIsInstance(bback, bytes) + self.assertIsInstance(aback, bytearray) + + def test_read(self): + for encoding in bytes_transform_encodings: + sin = b"\x80".transform(encoding) + reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.read() + self.assertEqual(sout, b"\x80") + + def test_readline(self): + for encoding in bytes_transform_encodings: + if encoding in ['uu_codec', 'zlib_codec']: + continue + sin = b"\x80".transform(encoding) + reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.readline() + self.assertEqual(sout, b"\x80") + + def test_main(): support.run_unittest( UTF32Test, @@ -1686,6 +1747,7 @@ def test_main(): TypesTest, SurrogateEscapeTest, BomTest, + TransformCodecTest, ) diff --git a/Misc/NEWS b/Misc/NEWS index f19b6f466a4..27965f1bad2 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,10 @@ What's New in Python 3.2 Beta 1? Core and Builtins ----------------- +- Issue #7475: Added transform() and untransform() methods to both bytes + and string types. They can be used to access those codecs providing + bytes-to-bytes and string-to-string mappings. + - Issue #8685: Speed up set difference ``a - b`` when source set ``a`` is much larger than operand ``b``. Patch by Andrew Bennetts. diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index f419eee9559..6ca096afd72 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -2488,6 +2488,75 @@ bytearray_decode(PyObject *self, PyObject *args, PyObject *kwargs) return PyUnicode_FromEncodedObject(self, encoding, errors); } +PyDoc_STRVAR(transform__doc__, +"B.transform(encoding, errors='strict') -> bytearray\n\ +\n\ +Transform B using the codec registered for encoding. errors may be given\n\ +to set a different error handling scheme."); + +static PyObject * +bytearray_transform(PyObject *self, PyObject *args, PyObject *kwargs) +{ + const char *encoding = NULL; + const char *errors = NULL; + static char *kwlist[] = {"encoding", "errors", 0}; + PyObject *v, *w; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:transform", + kwlist, &encoding, &errors)) + return NULL; + + v = PyCodec_Encode(self, encoding, errors); + if (v == NULL) + return NULL; + if (!PyBytes_Check(v)) { + PyErr_Format(PyExc_TypeError, + "encoder did not return a bytes object (type=%.400s)", + Py_TYPE(v)->tp_name); + Py_DECREF(v); + return NULL; + } + w = PyByteArray_FromStringAndSize(PyBytes_AS_STRING(v), + PyBytes_GET_SIZE(v)); + Py_DECREF(v); + return w; +} + + +PyDoc_STRVAR(untransform__doc__, +"B.untransform(encoding, errors='strict') -> bytearray\n\ +\n\ +Reverse-transform B using the codec registered for encoding. errors may\n\ +be given to set a different error handling scheme."); + +static PyObject * +bytearray_untransform(PyObject *self, PyObject *args, PyObject *kwargs) +{ + const char *encoding = NULL; + const char *errors = NULL; + static char *kwlist[] = {"encoding", "errors", 0}; + PyObject *v, *w; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:untransform", + kwlist, &encoding, &errors)) + return NULL; + + v = PyCodec_Decode(self, encoding, errors); + if (v == NULL) + return NULL; + if (!PyBytes_Check(v)) { + PyErr_Format(PyExc_TypeError, + "decoder did not return a bytes object (type=%.400s)", + Py_TYPE(v)->tp_name); + Py_DECREF(v); + return NULL; + } + w = PyByteArray_FromStringAndSize(PyBytes_AS_STRING(v), + PyBytes_GET_SIZE(v)); + Py_DECREF(v); + return w; +} + PyDoc_STRVAR(alloc_doc, "B.__alloc__() -> int\n\ \n\ @@ -2782,8 +2851,12 @@ bytearray_methods[] = { {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS, _Py_swapcase__doc__}, {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__}, + {"transform", (PyCFunction)bytearray_transform, METH_VARARGS | METH_KEYWORDS, + transform__doc__}, {"translate", (PyCFunction)bytearray_translate, METH_VARARGS, translate__doc__}, + {"untransform", (PyCFunction)bytearray_untransform, METH_VARARGS | METH_KEYWORDS, + untransform__doc__}, {"upper", (PyCFunction)stringlib_upper, METH_NOARGS, _Py_upper__doc__}, {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, zfill__doc__}, {NULL} diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 96134a3a3fb..d3b8a4fe804 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -2312,6 +2312,68 @@ bytes_decode(PyObject *self, PyObject *args, PyObject *kwargs) return PyUnicode_FromEncodedObject(self, encoding, errors); } +PyDoc_STRVAR(transform__doc__, +"B.transform(encoding, errors='strict') -> bytes\n\ +\n\ +Transform B using the codec registered for encoding. errors may be given\n\ +to set a different error handling scheme."); + +static PyObject * +bytes_transform(PyObject *self, PyObject *args, PyObject *kwargs) +{ + const char *encoding = NULL; + const char *errors = NULL; + static char *kwlist[] = {"encoding", "errors", 0}; + PyObject *v; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:transform", + kwlist, &encoding, &errors)) + return NULL; + + v = PyCodec_Encode(self, encoding, errors); + if (v == NULL) + return NULL; + if (!PyBytes_Check(v)) { + PyErr_Format(PyExc_TypeError, + "encoder did not return a bytes object (type=%.400s)", + Py_TYPE(v)->tp_name); + Py_DECREF(v); + return NULL; + } + return v; +} + + +PyDoc_STRVAR(untransform__doc__, +"B.untransform(encoding, errors='strict') -> bytes\n\ +\n\ +Reverse-transform B using the codec registered for encoding. errors may\n\ +be given to set a different error handling scheme."); + +static PyObject * +bytes_untransform(PyObject *self, PyObject *args, PyObject *kwargs) +{ + const char *encoding = NULL; + const char *errors = NULL; + static char *kwlist[] = {"encoding", "errors", 0}; + PyObject *v; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:untransform", + kwlist, &encoding, &errors)) + return NULL; + + v = PyCodec_Decode(self, encoding, errors); + if (v == NULL) + return NULL; + if (!PyBytes_Check(v)) { + PyErr_Format(PyExc_TypeError, + "decoder did not return a bytes object (type=%.400s)", + Py_TYPE(v)->tp_name); + Py_DECREF(v); + return NULL; + } + return v; +} PyDoc_STRVAR(splitlines__doc__, "B.splitlines([keepends]) -> list of lines\n\ @@ -2475,8 +2537,10 @@ bytes_methods[] = { {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS, _Py_swapcase__doc__}, {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__}, + {"transform", (PyCFunction)bytes_transform, METH_VARARGS | METH_KEYWORDS, transform__doc__}, {"translate", (PyCFunction)bytes_translate, METH_VARARGS, translate__doc__}, + {"untransform", (PyCFunction)bytes_untransform, METH_VARARGS | METH_KEYWORDS, untransform__doc__}, {"upper", (PyCFunction)stringlib_upper, METH_NOARGS, _Py_upper__doc__}, {"zfill", (PyCFunction)stringlib_zfill, METH_VARARGS, zfill__doc__}, {"__sizeof__", (PyCFunction)bytes_sizeof, METH_NOARGS, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 89e3c8afb91..fd508825ffc 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -7432,6 +7432,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); if (v == NULL) goto onError; + /* XXX this check is redundant */ if (!PyBytes_Check(v)) { PyErr_Format(PyExc_TypeError, "encoder did not return a bytes object " @@ -7446,6 +7447,44 @@ unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) return NULL; } +PyDoc_STRVAR(transform__doc__, + "S.transform(encoding, errors='strict') -> str\n\ +\n\ +Transform S using the codec registered for encoding. errors may be given\n\ +to set a different error handling scheme."); + +static PyObject * +unicode_transform(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = {"encoding", "errors", 0}; + char *encoding = NULL; + char *errors = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:transform", + kwlist, &encoding, &errors)) + return NULL; + return PyUnicode_AsEncodedUnicode((PyObject *)self, encoding, errors); +} + +PyDoc_STRVAR(untransform__doc__, + "S.untransform(encoding, errors='strict') -> str\n\ +\n\ +Reverse-transform S using the codec registered for encoding. errors may be\n\ +given to set a different error handling scheme."); + +static PyObject * +unicode_untransform(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = {"encoding", "errors", 0}; + char *encoding = NULL; + char *errors = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:untransform", + kwlist, &encoding, &errors)) + return NULL; + return PyUnicode_AsDecodedUnicode((PyObject *)self, encoding, errors); +} + PyDoc_STRVAR(expandtabs__doc__, "S.expandtabs([tabsize]) -> str\n\ \n\ @@ -9091,7 +9130,8 @@ static PyMethodDef unicode_methods[] = { /* Order is according to common usage: often used methods should appear first, since lookup is done sequentially. */ - {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, + {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, + encode__doc__}, {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, @@ -9136,6 +9176,10 @@ static PyMethodDef unicode_methods[] = { {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS | METH_STATIC, maketrans__doc__}, + {"transform", (PyCFunction) unicode_transform, METH_VARARGS | METH_KEYWORDS, + transform__doc__}, + {"untransform", (PyCFunction) unicode_untransform, METH_VARARGS | METH_KEYWORDS, + untransform__doc__}, {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, #if 0 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},