415 lines
12 KiB
Python
415 lines
12 KiB
Python
""" codecs -- Python Codec Registry, API and helpers.
|
|
|
|
|
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
|
|
|
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
|
|
|
"""#"
|
|
|
|
import struct,types,__builtin__
|
|
|
|
### Registry and builtin stateless codec functions
|
|
|
|
from _codecs import *
|
|
|
|
### Constants
|
|
|
|
#
|
|
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
|
|
#
|
|
# U+FEFF (ZERO WIDTH NO-BREAK SPACE) packed as an unsigned 16-bit value
# in the byte order of the running platform
BOM = struct.pack('=H',0xFEFF)

# Two-byte mark in big endian order: the bytes FE FF, i.e. U+FEFF as a
# big endian UTF-16 reader sees it
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# Two-byte mark in little endian order: the bytes FF FE. A reader that
# assumes big endian order would see U+FFFE here, which Unicode defines
# as an illegal character -- that is what makes the byte order
# detectable
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

#
# Four-byte Byte Order Marks for UCS-4
#
# NOTE: despite the "64" in the names, each value is 4 bytes (32 bits)
# long.
#

# U+0000FEFF with the most significant byte first
BOM64_BE = '\000\000\376\377'

# U+0000FEFF with the least significant byte first
BOM64_LE = '\377\376\000\000'
|
|
|
|
|
|
### Codec base classes (defining the API)
|
|
|
|
class Codec:

    """ Interface definition for stateless encoders and decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are defined:

         'strict'  - raise an error (or a subclass)
         'ignore'  - skip the offending character and carry on with
                     the next one
         'replace' - substitute a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

        This base class implements neither method; both raise
        NotImplementedError until a subclass overrides them.

    """
    def encode(self,input,errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance.
            Codecs that need state to work efficiently should build
            on StreamWriter/StreamReader instead.

            Zero length input has to be supported; the result is then
            an empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self,input,errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input has to be an object providing the bf_getreadbuf
            buffer slot, e.g. Python strings, buffer objects and
            memory mapped files.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance.
            Codecs that need state to work efficiently should build
            on StreamWriter/StreamReader instead.

            Zero length input has to be supported; the result is then
            an empty object of the output object type.

        """
        raise NotImplementedError()
|
|
|
|
#
|
|
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
|
|
#
|
|
|
|
class StreamWriter(Codec):

    """ Writer which encodes objects with self.encode() before
        handing the result to an underlying stream.

        Attributes not defined here are looked up on the wrapped
        stream (see __getattr__).

    """

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self,object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using the error
            handling scheme stored in self.errors. Nothing is returned.

        """
        # .encode() returns (output object, length consumed); only the
        # output is needed here
        data, consumed = self.encode(object,self.errors)
        self.stream.write(data)

    # XXX .writelines() ?
    #     As long as it is not defined here, .writelines() resolves
    #     through __getattr__ to the underlying stream and therefore
    #     bypasses .encode().

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (e.g. __init__ was never run).

        """
        # __getattr__ is only consulted when normal lookup failed, so
        # being asked for 'stream' means it was never set. Looking it
        # up again via self.stream would recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream,name)
|
|
|
|
###
|
|
|
|
class StreamReader(Codec):

    """ Reader which decodes the data read from an underlying stream
        with self.decode().

        Attributes not defined here are looked up on the wrapped
        stream (see __getattr__).

    """

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self,size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding uses the error handling scheme stored in
            self.errors. If the data still cannot be decoded after
            the retries described below, the ValueError raised by
            self.decode() is propagated.

        """
        errors = self.errors

        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, errors)
            except ValueError:
                # The slice may have ended in the middle of a
                # character: extend it one byte at a time and retry.
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)
    #     As long as they are not defined here, they resolve through
    #     __getattr__ to the underlying stream and therefore bypass
    #     .decode().

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (e.g. __init__ was never run).

        """
        # __getattr__ is only consulted when normal lookup failed, so
        # being asked for 'stream' means it was never set. Looking it
        # up again via self.stream would recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream,name)
|
|
|
|
###
|
|
|
|
class StreamReaderWriter:

    """ Wrapper combining a reader and a writer which share one
        underlying stream.

        .read() is served by the reader, .write() by the writer; all
        other attributes are looked up on the stream itself (see
        __getattr__).

    """

    def __init__(self,stream,Reader,Writer,errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Returns the result of self.reader.read(size).
        """
        return self.reader.read(size)

    def write(self,data):

        """ Returns the result of self.writer.write(data).
        """
        return self.writer.write(data)

    def reset(self):

        """ Resets the reader first and then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (e.g. __init__ was never run).

        """
        # __getattr__ is only consulted when normal lookup failed, so
        # being asked for 'stream' means it was never set. Looking it
        # up again via self.stream would recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream,name)
|
|
|
|
###
|
|
|
|
class StreamRecoder:

    """ Wrapper translating between a frontend encoding seen by the
        caller and a backend encoding used on the underlying stream.

        Attributes not defined here are looked up on the wrapped
        stream (see __getattr__).

    """

    def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend
            (encode produces the output of .read(), decode consumes
            the input to .write()) while Reader and Writer work on
            the backend (reading and writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp. Each factory is
            called as factory(stream, errors).

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Reads via self.reader.read(size) and returns that data
            re-encoded with self.encode() using self.errors.

        """
        data = self.reader.read(size)
        # .encode() returns (output object, length consumed)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def write(self,data):

        """ Decodes data with self.decode() using self.errors and
            returns the result of passing it to self.writer.write().

        """
        # .decode() returns (output object, length consumed)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    # .writelines(), .readline() and .readlines() are not defined
    # here; they resolve through __getattr__ to the underlying stream
    # and therefore bypass the recoding.

    def reset(self):

        """ Resets the reader first and then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (e.g. __init__ was never run).

        """
        # __getattr__ is only consulted when normal lookup failed, so
        # being asked for 'stream' means it was never set. Looking it
        # up again via self.stream would recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream,name)
|
|
|
|
### Shortcuts
|
|
|
|
def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        encoding specifies the encoding which is to be used for
        the file. If it is None, the plain file object returned by
        the builtin open() is handed back unwrapped. Otherwise the
        file is always opened in binary mode ('b' is appended to mode
        when missing) and wrapped in a StreamReaderWriter.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The codec is looked up before the file is touched, so a
        failing lookup(encoding) neither creates nor truncates the
        file.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)

    # Resolve the codec first: opening in a write mode has side
    # effects on disk which must not happen for an unusable encoding.
    # lookup() returns (encoder, decoder, reader factory, writer
    # factory); only the stream factories are needed here.
    (e,d,sr,sw) = lookup(encoding)

    # Force opening of the file in binary mode
    if 'b' not in mode:
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    try:
        return StreamReaderWriter(stream, sr, sw, errors)
    except:
        # Cleanup only -- the exception is re-raised unchanged. Without
        # this the open file would be left dangling if a factory fails.
        stream.close()
        raise
|
|
|
|
def EncodedFile(file, input, output=None, errors='strict'):

    """ Wrap file in a StreamRecoder so that encoding translation
        happens transparently.

        Strings written to the wrapped file are interpreted according
        to the given input encoding and then written to the original
        file as string using the output encoding. The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        output defaults to input when it is not given.

        errors selects the error handling scheme. It defaults to
        'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    backend_encoding = output
    if backend_encoding is None:
        backend_encoding = input

    # The stateless functions of the input codec serve the frontend ...
    frontend_codec = lookup(input)
    encode, decode = frontend_codec[:2]

    # ... while the stream factories of the output codec serve the
    # backend
    backend_codec = lookup(backend_encoding)
    Reader, Writer = backend_codec[2:]

    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    return recoder
|
|
|
|
### Tests
|
|
|
|
if __name__ == '__main__':

    import sys

    # Replace stdout by a recoding wrapper: what gets written is taken
    # to be Latin-1 and is passed on to the real stdout as
    # Unicode-Escape
    recoding_stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')
    sys.stdout = recoding_stdout
|