Module codecs -- Python Codec Registry, API and helpers. Written by

Marc-Andre Lemburg.
2000-03-10 23:20:43 +00:00 · 2000-03-10 23:20:43 +00:00 · 0612d84155
parent b5f2f1bb6f
commit 0612d84155
1 changed files with 414 additions and 0 deletions
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@ -0,0 +1,414 @@
+""" codecs -- Python Codec Registry, API and helpers.
+
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+
+"""#"
+
+import struct,types,__builtin__
+
+### Registry and builtin stateless codec functions
+
+from _codecs import *
+
+### Constants
+
+#
+# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
+#
+BOM = struct.pack('=H',0xFEFF)
+#
+BOM_BE = BOM32_BE = '\376\377'
+#	corresponds to Unicode U+FEFF in UTF-16 on big endian
+#	platforms == ZERO WIDTH NO-BREAK SPACE
+BOM_LE = BOM32_LE = '\377\376' 
+#	corresponds to Unicode U+FFFE in UTF-16 on little endian
+#	platforms == defined as being an illegal Unicode character
+
+#
+# 64-bit Byte Order Marks
+#
+BOM64_BE = '\000\000\376\377'
+#	corresponds to Unicode U+0000FEFF in UCS-4
+BOM64_LE = '\377\376\000\000'
+#	corresponds to Unicode U+0000FFFE in UCS-4
+
+
+### Codec base classes (defining the API)
+
+class Codec:
+
+    """ Defines the interface for stateless encoders/decoders.
+
+        The .encode()/.decode() methods may implement different error
+        handling schemes by providing the errors argument. These
+        string values are defined:
+
+         'strict' - raise an error (or a subclass)
+         'ignore' - ignore the character and continue with the next
+         'replace' - replace with a suitable replacement character;
+                    Python will use the official U+FFFD REPLACEMENT
+                    CHARACTER for the builtin Unicode codecs.
+
+    """
+    def encode(self,input,errors='strict'):
+        
+        """ Encodes the object intput and returns a tuple (output
+            object, length consumed).
+
+            errors defines the error handling to apply. It defaults to
+            'strict' handling.
+
+            The method may not store state in the Codec instance. Use
+            StreamCodec for codecs which have to keep state in order to
+            make encoding/decoding efficient.
+
+            The encoder must be able to handle zero length input and
+            return an empty object of the output object type in this
+            situation.
+
+        """
+        raise NotImplementedError
+
+    def decode(self,input,errors='strict'):
+
+        """ Decodes the object input and returns a tuple (output
+            object, length consumed).
+
+            input must be an object which provides the bf_getreadbuf
+            buffer slot. Python strings, buffer objects and memory
+            mapped files are examples of objects providing this slot.
+        
+            errors defines the error handling to apply. It defaults to
+            'strict' handling.
+
+            The method may not store state in the Codec instance. Use
+            StreamCodec for codecs which have to keep state in order to
+            make encoding/decoding efficient.
+
+            The decoder must be able to handle zero length input and
+            return an empty object of the output object type in this
+            situation.
+
+        """ 
+        raise NotImplementedError
+
+#
+# The StreamWriter and StreamReader class provide generic working
+# interfaces which can be used to implement new encodings submodules
+# very easily. See encodings/utf_8.py for an example on how this is
+# done.
+# 
+
+class StreamWriter(Codec):
+
+    def __init__(self,stream,errors='strict'):
+
+        """ Creates a StreamWriter instance.
+
+            stream must be a file-like object open for writing
+            (binary) data.
+
+            The StreamWriter may implement different error handling
+            schemes by providing the errors keyword argument. These
+            parameters are defined:
+
+             'strict' - raise a ValueError (or a subclass)
+             'ignore' - ignore the character and continue with the next
+             'replace'- replace with a suitable replacement character
+
+        """
+        self.stream = stream
+        self.errors = errors
+
+    def write(self,object):
+
+        """ Writes the object's contents encoded to self.stream.
+        """
+        data, consumed = self.encode(object,self.errors)
+        self.stream.write(data)
+
+    # XXX .writelines() ?
+        
+    def reset(self):
+
+        """ Flushes and resets the codec buffers used for keeping state.
+
+            Calling this method should ensure that the data on the
+            output is put into a clean state, that allows appending
+            of new fresh data without having to rescan the whole
+            stream to recover state.
+
+        """
+        pass
+
+    def __getattr__(self,name,
+
+                    getattr=getattr):
+
+        """ Inherit all other methods from the underlying stream.
+        """
+        return getattr(self.stream,name)
+
+###
+
+class StreamReader(Codec):
+
+    def __init__(self,stream,errors='strict'):
+
+        """ Creates a StreamReader instance.
+
+            stream must be a file-like object open for reading
+            (binary) data.
+
+            The StreamReader may implement different error handling
+            schemes by providing the errors keyword argument. These
+            parameters are defined:
+
+             'strict' - raise a ValueError (or a subclass)
+             'ignore' - ignore the character and continue with the next
+             'replace'- replace with a suitable replacement character;
+
+        """
+        self.stream = stream
+        self.errors = errors
+
+    def read(self,size=-1):
+
+        """ Decodes data from the stream self.stream and returns the
+            resulting object.
+
+            size indicates the approximate maximum number of bytes to
+            read from the stream for decoding purposes. The decoder
+            can modify this setting as appropriate. The default value
+            -1 indicates to read and decode as much as possible.  size
+            is intended to prevent having to decode huge files in one
+            step.
+
+            The method should use a greedy read strategy meaning that
+            it should read as much data as is allowed within the
+            definition of the encoding and the given size, e.g.  if
+            optional encoding endings or state markers are available
+            on the stream, these should be read too.
+
+        """
+        # Unsliced reading:
+        if size < 0:
+            return self.decode(self.stream.read())[0]
+        
+        # Sliced reading:
+        read = self.stream.read
+        decode = self.decode
+        data = read(size)
+        i = 0
+        while 1:
+            try:
+                object, decodedbytes = decode(data)
+            except ValueError,why:
+                # This method is slow but should work under pretty much
+                # all conditions; at most 10 tries are made
+                i = i + 1
+                newdata = read(1)
+                if not newdata or i > 10:
+                    raise
+                data = data + newdata
+            else:
+                return object
+
+    # XXX .readline() and .readlines() (these are hard to implement
+    #     without using buffers for keeping read-ahead data)
+
+    def reset(self):
+
+        """ Resets the codec buffers used for keeping state.
+
+            Note that no stream repositioning should take place.
+            This method is primarely intended to be able to recover
+            from decoding errors.
+
+        """
+        pass
+
+    def __getattr__(self,name,
+
+                    getattr=getattr):
+
+        """ Inherit all other methods from the underlying stream.
+        """
+        return getattr(self.stream,name)
+
+###
+
+class StreamReaderWriter:
+
+    def __init__(self,stream,Reader,Writer,errors='strict'):
+
+        """ Creates a StreamReaderWriter instance.
+
+            stream must be a Stream-like object.
+
+            Reader, Writer must be factory functions or classes
+            providing the StreamReader, StreamWriter interface resp.
+
+            Error handling is done in the same way as defined for the
+            StreamWriter/Readers.
+
+        """
+        self.stream = stream
+        self.reader = Reader(stream, errors)
+        self.writer = Writer(stream, errors)
+        self.errors = errors
+
+    def read(self,size=-1):
+
+        return self.reader.read(size)
+
+    def write(self,data):
+
+        return self.writer.write(data)
+
+    def reset(self):
+
+        self.reader.reset()
+        self.writer.reset()
+
+    def __getattr__(self,name,
+
+                    getattr=getattr):
+
+        """ Inherit all other methods from the underlying stream.
+        """
+        return getattr(self.stream,name)
+
+###
+
+class StreamRecoder:
+
+    def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
+
+        """ Creates a StreamRecoder instance which implements a two-way
+            conversion: encode and decode work on the frontend (the
+            input to .read() and output of .write()) while 
+            Reader and Writer work on the backend (reading and
+            writing to the the stream).
+
+            You can use these objects to do transparent direct
+            recodings from e.g. latin-1 to utf-8 and back.
+
+            stream must be a file-like object.
+
+            encode, decode must adhere to the Codec interface, Reader,
+            Writer must be factory functions or classes providing the
+            StreamReader, StreamWriter interface resp.
+
+            encode and decode are needed for the frontend translation,
+            Reader and Writer for the backend translation. Unicode is
+            used as intermediate encoding.
+
+            Error handling is done in the same way as defined for the
+            StreamWriter/Readers.
+
+        """
+        self.stream = stream
+        self.encode = encode
+        self.decode = decode
+        self.reader = Reader(stream, errors)
+        self.writer = Writer(stream, errors)
+        self.errors = errors
+
+    def read(self,size=-1):
+
+        data = self.reader.read(size)
+        data, bytesencoded = self.encode(data, self.errors)
+        return data
+
+    def write(self,data):
+
+        data, bytesdecoded = self.decode(data, self.errors)
+        return self.writer.write(data)
+
+    # .writelines(), .readline() and .readlines() ... see notes
+    # above.
+
+    def reset(self):
+
+        self.reader.reset()
+        self.writer.reset()
+
+    def __getattr__(self,name,
+
+                    getattr=getattr):
+
+        """ Inherit all other methods from the underlying stream.
+        """
+        return getattr(self.stream,name)
+
+### Shortcuts
+
+def open(filename, mode, encoding=None, errors='strict', buffering=1):
+
+    """ Open an encoded file using the given mode and return
+        a wrapped version providing transparent encoding/decoding.
+
+        Note: The wrapped version will only accept the object format
+        defined by the codecs, i.e. Unicode objects for most builtin
+        codecs. Output is also codec dependent and will usually by
+        Unicode as well.
+
+        encoding specifies the encoding which is to be used for the
+        the file.
+
+        errors may be given to define the error handling. It defaults
+        to 'strict' which causes ValueErrors to be raised in case an
+        encoding error occurs.
+
+        buffering has the same meaning as for the builtin open() API.
+        It defaults to line buffered.
+
+    """
+    if encoding is not None and \
+       'b' not in mode:
+        # Force opening of the file in binary mode
+        mode = mode + 'b'
+    file = __builtin__.open(filename, mode, buffering)
+    if encoding is None:
+        return file
+    (e,d,sr,sw) = lookup(encoding)
+    return StreamReaderWriter(file, sr, sw, errors)
+
+def EncodedFile(file, input, output=None, errors='strict'):
+
+    """ Return a wrapped version of file which provides transparent
+        encoding translation.
+
+        Strings written to the wrapped file are interpreted according
+        to the given input encoding and then written to the original
+        file as string using the output encoding. The intermediate
+        encoding will usually be Unicode but depends on the specified
+        codecs.
+
+        If output is not given, it defaults to input.
+
+        errors may be given to define the error handling. It defaults
+        to 'strict' which causes ValueErrors to be raised in case an
+        encoding error occurs.
+
+    """
+    if output is None:
+        output = input
+    encode, decode = lookup(input)[:2]
+    Reader, Writer = lookup(output)[2:]
+    return StreamRecoder(file,
+                         encode,decode,Reader,Writer,
+                         errors)
+
+### Tests
+    
+if __name__ == '__main__':
+
+    import sys
+    
+    # Make stdout translate Latin-1 into Unicode-Escape
+    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')