Module codecs -- Python Codec Registry, API and helpers. Written by
Marc-Andre Lemburg.
This commit is contained in:
parent
b5f2f1bb6f
commit
0612d84155
|
@ -0,0 +1,414 @@
|
|||
""" codecs -- Python Codec Registry, API and helpers.
|
||||
|
||||
|
||||
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
||||
|
||||
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
||||
|
||||
"""#"
|
||||
|
||||
import struct,types,__builtin__
|
||||
|
||||
### Registry and builtin stateless codec functions
|
||||
|
||||
from _codecs import *
|
||||
|
||||
### Constants
|
||||
|
||||
#
# Byte Order Mark (BOM): U+FEFF packed as an unsigned 16-bit value in
# the byte order of the running platform, i.e. the same bytes as BOM_BE
# on big endian platforms and as BOM_LE on little endian ones
#
BOM = struct.pack('=H', 0xFEFF)

#
# Two-byte Byte Order Marks (BOM32_BE / BOM32_LE are aliases for the
# very same values)
#

# Bytes FE FF: U+FEFF (ZERO WIDTH NO-BREAK SPACE) stored big endian
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# Bytes FF FE: U+FEFF stored little endian; a reader assuming big
# endian data sees U+FFFE instead, which is defined as being an
# illegal Unicode character
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

#
# Four-byte (UCS-4) Byte Order Marks; note that despite the BOM64_*
# names these values are four bytes long
#

# Bytes 00 00 FE FF: U+0000FEFF stored big endian
BOM64_BE = '\000\000\376\377'

# Bytes FF FE 00 00: U+0000FEFF stored little endian
BOM64_LE = '\377\376\000\000'
|
||||
|
||||
|
||||
### Codec base classes (defining the API)
|
||||
|
||||
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme. The following string
        values are defined:

         'strict'  - raise an error (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

        This base class implements neither method; both raise
        NotImplementedError and have to be overridden by subclasses.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme to apply; the
            default is 'strict'.

            Implementations may not store state in the Codec
            instance. Codecs which have to keep state in order to
            make encoding efficient should use StreamWriter.

            Zero length input must be supported; in that case an
            empty object of the output object type has to be
            returned.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme to apply; the
            default is 'strict'.

            Implementations may not store state in the Codec
            instance. Codecs which have to keep state in order to
            make decoding efficient should use StreamReader.

            Zero length input must be supported; in that case an
            empty object of the output object type has to be
            returned.

        """
        raise NotImplementedError()
|
||||
|
||||
#
|
||||
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
|
||||
#
|
||||
|
||||
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter working on stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme which is handed
            to .encode() on every .write(). These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object using self.encode() and the stored error
            handling scheme, then write the result to self.stream.
        """
        # Only the encoded data is needed; the consumed count is dropped
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    # XXX .writelines() ?

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After calling this method the data on the output should
            be in a clean state which allows appending new fresh data
            without having to rescan the whole stream to recover
            state.

            This base implementation does nothing.

        """
        return None

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on the writer itself on
            the underlying stream.
        """
        return getattr(self.stream, name)
|
||||
|
||||
###
|
||||
|
||||
class StreamReader(Codec):

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. The
            value is stored and passed to .decode() by .read(). These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def read(self,size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding uses the error handling scheme stored in
            self.errors. If the data read for a given size cannot be
            decoded, single bytes are read on top of it and decoding
            is retried; the ValueError raised by the decoder is
            propagated once the stream is exhausted or the retries
            are used up.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        errors = self.errors
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, errors)
            except ValueError:
                # The slice probably ended in the middle of a character.
                # This method is slow but should work under pretty much
                # all conditions; the data is extended by one byte per
                # failure and at most 10 extra bytes are retried
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    # Re-raise the decoder's ValueError unchanged
                    raise
                data = data + newdata
            else:
                return decoded

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
|
||||
|
||||
###
|
||||
|
||||
class StreamReaderWriter:

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter which reads and writes
            through one and the same stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp. Each is called with (stream, errors) and the
            results are kept as self.reader and self.writer.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Return the result of self.reader.read(size).
        """
        reader = self.reader
        return reader.read(size)

    def write(self, data):

        """ Pass data to self.writer.write() and return its result.
        """
        writer = self.writer
        return writer.write(data)

    def reset(self):

        """ Reset the reader first, then the writer.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on this object on the
            underlying stream.
        """
        return getattr(self.stream, name)
|
||||
|
||||
###
|
||||
|
||||
class StreamRecoder:

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading from and
            writing to the stream).

            These objects can be used to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp. and are called with (stream, errors).

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Read via self.reader, run the result through self.encode
            and return the encoded data.
        """
        intermediate = self.reader.read(size)
        recoded, bytesencoded = self.encode(intermediate, self.errors)
        return recoded

    def write(self, data):

        """ Run data through self.decode, write the result via
            self.writer and return whatever the writer returns.
        """
        intermediate, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    # .writelines(), .readline() and .readlines() ... see notes
    # above.

    def reset(self):

        """ Reset the reader first, then the writer.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on this object on the
            underlying stream.
        """
        return getattr(self.stream, name)
|
||||
|
||||
### Shortcuts
|
||||
|
||||
def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the file is opened with the builtin
        open() using the mode as given and the plain file object is
        returned. Otherwise the file is always opened in binary mode
        ('b' is appended to mode if missing) and wrapped into a
        StreamReaderWriter.

        The codec is looked up before the file is opened, so an
        unknown encoding raises LookupError without creating,
        truncating or leaving open the file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

    """
    if encoding is None:
        # No codec requested: hand back the plain builtin file object
        return __builtin__.open(filename, mode, buffering)

    # Resolve the codec first; only the stream reader and writer
    # factories of the registry entry are needed here
    Reader, Writer = lookup(encoding)[2:4]

    # Force opening of the file in binary mode
    if 'b' not in mode:
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    return StreamReaderWriter(stream, Reader, Writer, errors)
|
||||
|
||||
def EncodedFile(file, input, output=None, errors='strict'):

    """ Wrap file into a StreamRecoder which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given input encoding and then written to the original
        file as string using the output encoding. The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        output defaults to input if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    if output is None:
        output = input

    # Frontend: stateless encode/decode functions of the input codec
    frontend = lookup(input)[:2]
    encode, decode = frontend

    # Backend: stream reader/writer factories of the output codec
    backend = lookup(output)[2:]
    Reader, Writer = backend

    return StreamRecoder(file, encode, decode, Reader, Writer, errors)
|
||||
|
||||
### Tests
|
||||
|
||||
if __name__ == '__main__':

    import sys

    # Replace stdout by a recoding wrapper: data written to it is
    # taken to be Latin-1 and reaches the real stdout as Unicode-Escape
    sys.stdout = EncodedFile(sys.stdout,
                             input='latin-1',
                             output='unicode-escape')
|
Loading…
Reference in New Issue