Apply SF patch #1698994: Add getstate() and setstate()
methods to incrementalcodecs. Also forward port r54786 (fix the incremental utf_8_sig decoder).
This commit is contained in:
parent
8981ad05c0
commit
3abcb013b8
|
@ -405,6 +405,21 @@ define in order to be compatible with the Python codec registry.
|
|||
Reset the encoder to the initial state.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{getstate}{}
|
||||
Return the current state of the encoder which must be an integer.
|
||||
The implementation should make sure that \code{0} is the most common state.
|
||||
(States that are more complicated than integers can be converted into an
|
||||
integer by marshaling/pickling the state and encoding the bytes of the
|
||||
resulting string into an integer).
|
||||
\versionadded{3.0}
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{setstate}{state}
|
||||
Set the state of the encoder to \var{state}. \var{state} must be an
|
||||
encoder state returned by \method{getstate}.
|
||||
\versionadded{3.0}
|
||||
\end{methoddesc}
|
||||
|
||||
|
||||
\subsubsection{IncrementalDecoder Objects \label{incremental-decoder-objects}}
|
||||
|
||||
|
@ -453,6 +468,27 @@ define in order to be compatible with the Python codec registry.
|
|||
Reset the decoder to the initial state.
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{getstate}{}
|
||||
Return the current state of the decoder. This must be a tuple with two
|
||||
items; the first must be the buffer containing the still undecoded input.
|
||||
The second must be an integer and can be additional state info.
|
||||
(The implementation should make sure that \code{0} is the most common
|
||||
additional state info.) If this additional state info is \code{0} it must
|
||||
be possible to set the decoder to the state which has no input buffered
|
||||
and \code{0} as the additional state info, so that feeding the previously
|
||||
buffered input to the decoder returns it to the previous state without
|
||||
producing any output. (Additional state info that is more complicated
|
||||
than integers can be converted into an integer by marshaling/pickling
|
||||
the info and encoding the bytes of the resulting string into an integer.)
|
||||
\versionadded{3.0}
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{setstate}{state}
|
||||
Set the state of the decoder to \var{state}. \var{state} must be a
|
||||
decoder state returned by \method{getstate}.
|
||||
\versionadded{3.0}
|
||||
\end{methoddesc}
|
||||
|
||||
|
||||
The \class{StreamWriter} and \class{StreamReader} classes provide
|
||||
generic working interfaces which can be used to implement new
|
||||
|
|
|
@ -87,7 +87,9 @@ class CodecInfo(tuple):
|
|||
return self
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
|
||||
return "<%s.%s object for encoding %s at 0x%x>" % \
|
||||
(self.__class__.__module__, self.__class__.__name__,
|
||||
self.name, id(self))
|
||||
|
||||
class Codec:
|
||||
|
||||
|
@ -155,9 +157,9 @@ class Codec:
|
|||
|
||||
class IncrementalEncoder(object):
|
||||
"""
|
||||
An IncrementalEncoder encodes an input in multiple steps. The input can be
|
||||
passed piece by piece to the encode() method. The IncrementalEncoder remembers
|
||||
the state of the Encoding process between calls to encode().
|
||||
An IncrementalEncoder encodes an input in multiple steps. The input can
|
||||
be passed piece by piece to the encode() method. The IncrementalEncoder
|
||||
remembers the state of the encoding process between calls to encode().
|
||||
"""
|
||||
def __init__(self, errors='strict'):
|
||||
"""
|
||||
|
@ -181,6 +183,18 @@ class IncrementalEncoder(object):
|
|||
Resets the encoder to the initial state.
|
||||
"""
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Return the current state of the encoder.
|
||||
"""
|
||||
return 0
|
||||
|
||||
def setstate(self, state):
|
||||
"""
|
||||
Set the current state of the encoder. state must have been
|
||||
returned by getstate().
|
||||
"""
|
||||
|
||||
class BufferedIncrementalEncoder(IncrementalEncoder):
|
||||
"""
|
||||
This subclass of IncrementalEncoder can be used as the baseclass for an
|
||||
|
@ -189,7 +203,8 @@ class BufferedIncrementalEncoder(IncrementalEncoder):
|
|||
"""
|
||||
def __init__(self, errors='strict'):
|
||||
IncrementalEncoder.__init__(self, errors)
|
||||
self.buffer = "" # unencoded input that is kept between calls to encode()
|
||||
# unencoded input that is kept between calls to encode()
|
||||
self.buffer = ""
|
||||
|
||||
def _buffer_encode(self, input, errors, final):
|
||||
# Overwrite this method in subclasses: It must encode input
|
||||
|
@ -208,10 +223,16 @@ class BufferedIncrementalEncoder(IncrementalEncoder):
|
|||
IncrementalEncoder.reset(self)
|
||||
self.buffer = ""
|
||||
|
||||
def getstate(self):
|
||||
return self.buffer or 0
|
||||
|
||||
def setstate(self, state):
|
||||
self.buffer = state or ""
|
||||
|
||||
class IncrementalDecoder(object):
|
||||
"""
|
||||
An IncrementalDecoder decodes an input in multiple steps. The input can be
|
||||
passed piece by piece to the decode() method. The IncrementalDecoder
|
||||
An IncrementalDecoder decodes an input in multiple steps. The input can
|
||||
be passed piece by piece to the decode() method. The IncrementalDecoder
|
||||
remembers the state of the decoding process between calls to decode().
|
||||
"""
|
||||
def __init__(self, errors='strict'):
|
||||
|
@ -235,15 +256,29 @@ class IncrementalDecoder(object):
|
|||
Resets the decoder to the initial state.
|
||||
"""
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Return the current state of the decoder. This must be a
|
||||
(buffered_input, additional_state_info) tuple.
|
||||
"""
|
||||
return ("", 0)
|
||||
|
||||
def setstate(self, state):
|
||||
"""
|
||||
Set the current state of the decoder. state must have been
|
||||
returned by getstate().
|
||||
"""
|
||||
|
||||
class BufferedIncrementalDecoder(IncrementalDecoder):
|
||||
"""
|
||||
This subclass of IncrementalDecoder can be used as the baseclass for an
|
||||
incremental decoder if the decoder must be able to handle incomplete byte
|
||||
sequences.
|
||||
incremental decoder if the decoder must be able to handle incomplete
|
||||
byte sequences.
|
||||
"""
|
||||
def __init__(self, errors='strict'):
|
||||
IncrementalDecoder.__init__(self, errors)
|
||||
self.buffer = "" # undecoded input that is kept between calls to decode()
|
||||
# undecoded input that is kept between calls to decode()
|
||||
self.buffer = ""
|
||||
|
||||
def _buffer_decode(self, input, errors, final):
|
||||
# Overwrite this method in subclasses: It must decode input
|
||||
|
@ -262,6 +297,14 @@ class BufferedIncrementalDecoder(IncrementalDecoder):
|
|||
IncrementalDecoder.reset(self)
|
||||
self.buffer = ""
|
||||
|
||||
def getstate(self):
|
||||
# additional state info is always 0
|
||||
return (self.buffer, 0)
|
||||
|
||||
def setstate(self, state):
|
||||
# ignore additional state info
|
||||
self.buffer = state[0]
|
||||
|
||||
#
|
||||
# The StreamWriter and StreamReader class provide generic working
|
||||
# interfaces which can be used to implement new encoding submodules
|
||||
|
@ -424,7 +467,8 @@ class StreamReader(Codec):
|
|||
newchars, decodedbytes = self.decode(data, self.errors)
|
||||
except UnicodeDecodeError as exc:
|
||||
if firstline:
|
||||
newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
|
||||
newchars, decodedbytes = \
|
||||
self.decode(data[:exc.start], self.errors)
|
||||
lines = newchars.splitlines(True)
|
||||
if len(lines)<=1:
|
||||
raise
|
||||
|
|
|
@ -34,6 +34,22 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
|
|||
codecs.IncrementalEncoder.reset(self)
|
||||
self.encoder = None
|
||||
|
||||
def getstate(self):
|
||||
# state info we return to the caller:
|
||||
# 0: stream is in natural order for this platform
|
||||
# 2: endianness hasn't been determined yet
|
||||
# (we're never writing in unnatural order)
|
||||
return (2 if self.encoder is None else 0)
|
||||
|
||||
def setstate(self, state):
|
||||
if state:
|
||||
self.encoder = None
|
||||
else:
|
||||
if sys.byteorder == 'little':
|
||||
self.encoder = codecs.utf_16_le_encode
|
||||
else:
|
||||
self.encoder = codecs.utf_16_be_encode
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
def __init__(self, errors='strict'):
|
||||
codecs.BufferedIncrementalDecoder.__init__(self, errors)
|
||||
|
@ -56,6 +72,35 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
|||
codecs.BufferedIncrementalDecoder.reset(self)
|
||||
self.decoder = None
|
||||
|
||||
def getstate(self):
|
||||
# additional state info from the base class must be 0 here,
|
||||
# as it isn't passed along to the caller
|
||||
state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
|
||||
# additional state info we pass to the caller:
|
||||
# 0: stream is in natural order for this platform
|
||||
# 1: stream is in unnatural order
|
||||
# 2: endianness hasn't been determined yet
|
||||
if self.decoder is None:
|
||||
return (state, 2)
|
||||
addstate = int((sys.byteorder == "big") !=
|
||||
(self.decoder is codecs.utf_16_be_decode))
|
||||
return (state, addstate)
|
||||
|
||||
def setstate(self, state):
|
||||
# state[1] will be ignored by BufferedIncrementalDecoder.setstate()
|
||||
codecs.BufferedIncrementalDecoder.setstate(self, state)
|
||||
state = state[1]
|
||||
if state == 0:
|
||||
self.decoder = (codecs.utf_16_be_decode
|
||||
if sys.byteorder == "big"
|
||||
else codecs.utf_16_le_decode)
|
||||
elif state == 1:
|
||||
self.decoder = (codecs.utf_16_le_decode
|
||||
if sys.byteorder == "big"
|
||||
else codecs.utf_16_be_decode)
|
||||
else:
|
||||
self.decoder = None
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
def __init__(self, stream, errors='strict'):
|
||||
self.bom_written = False
|
||||
|
|
|
@ -12,7 +12,8 @@ import codecs
|
|||
### Codec APIs
|
||||
|
||||
def encode(input, errors='strict'):
|
||||
return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
|
||||
return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
|
||||
len(input))
|
||||
|
||||
def decode(input, errors='strict'):
|
||||
prefix = 0
|
||||
|
@ -25,38 +26,61 @@ def decode(input, errors='strict'):
|
|||
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||
def __init__(self, errors='strict'):
|
||||
codecs.IncrementalEncoder.__init__(self, errors)
|
||||
self.first = True
|
||||
self.first = 1
|
||||
|
||||
def encode(self, input, final=False):
|
||||
if self.first:
|
||||
self.first = False
|
||||
return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
|
||||
self.first = 0
|
||||
return codecs.BOM_UTF8 + \
|
||||
codecs.utf_8_encode(input, self.errors)[0]
|
||||
else:
|
||||
return codecs.utf_8_encode(input, self.errors)[0]
|
||||
|
||||
def reset(self):
|
||||
codecs.IncrementalEncoder.reset(self)
|
||||
self.first = True
|
||||
self.first = 1
|
||||
|
||||
def getstate(self):
|
||||
return self.first
|
||||
|
||||
def setstate(self, state):
|
||||
self.first = state
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
def __init__(self, errors='strict'):
|
||||
codecs.BufferedIncrementalDecoder.__init__(self, errors)
|
||||
self.first = True
|
||||
self.first = 1
|
||||
|
||||
def _buffer_decode(self, input, errors, final):
|
||||
if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
|
||||
if self.first:
|
||||
if len(input) < 3:
|
||||
# not enough data to decide if this really is a BOM
|
||||
# => try again on the next call
|
||||
return (u"", 0)
|
||||
(output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
|
||||
self.first = False
|
||||
return (output, consumed+3)
|
||||
if codecs.BOM_UTF8.startswith(input):
|
||||
# not enough data to decide if this really is a BOM
|
||||
# => try again on the next call
|
||||
return (u"", 0)
|
||||
else:
|
||||
self.first = 0
|
||||
else:
|
||||
self.first = 0
|
||||
if input[:3] == codecs.BOM_UTF8:
|
||||
(output, consumed) = \
|
||||
codecs.utf_8_decode(input[3:], errors, final)
|
||||
return (output, consumed+3)
|
||||
return codecs.utf_8_decode(input, errors, final)
|
||||
|
||||
def reset(self):
|
||||
codecs.BufferedIncrementalDecoder.reset(self)
|
||||
self.first = True
|
||||
self.first = 1
|
||||
|
||||
def getstate(self):
|
||||
state = codecs.BufferedIncrementalDecoder.getstate(self)
|
||||
# state[1] must be 0 here, as it isn't passed along to the caller
|
||||
return (state[0], self.first)
|
||||
|
||||
def setstate(self, state):
|
||||
# state[1] will be ignored by BufferedIncrementalDecoder.setstate()
|
||||
codecs.BufferedIncrementalDecoder.setstate(self, state)
|
||||
self.first = state[1]
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
def reset(self):
|
||||
|
|
|
@ -23,7 +23,40 @@ class Queue(object):
|
|||
self._buffer = self._buffer[size:]
|
||||
return s
|
||||
|
||||
class ReadTest(unittest.TestCase):
|
||||
class MixInCheckStateHandling:
|
||||
def check_state_handling_decode(self, encoding, u, s):
|
||||
for i in xrange(len(s)+1):
|
||||
d = codecs.getincrementaldecoder(encoding)()
|
||||
part1 = d.decode(s[:i])
|
||||
state = d.getstate()
|
||||
self.assert_(isinstance(state[1], int))
|
||||
# Check that the condition stated in the documentation for
|
||||
# IncrementalDecoder.getstate() holds
|
||||
if not state[1]:
|
||||
# reset decoder to the default state without anything buffered
|
||||
d.setstate((state[0][:0], 0))
|
||||
# Feeding the previous input must not produce any output
|
||||
self.assert_(not d.decode(state[0]))
|
||||
# The decoder must return to the same state
|
||||
self.assertEqual(state, d.getstate())
|
||||
# Create a new decoder and set it to the state
|
||||
# we extracted from the old one
|
||||
d = codecs.getincrementaldecoder(encoding)()
|
||||
d.setstate(state)
|
||||
part2 = d.decode(s[i:], True)
|
||||
self.assertEqual(u, part1+part2)
|
||||
|
||||
def check_state_handling_encode(self, encoding, u, s):
|
||||
for i in xrange(len(u)+1):
|
||||
d = codecs.getincrementalencoder(encoding)()
|
||||
part1 = d.encode(u[:i])
|
||||
state = d.getstate()
|
||||
d = codecs.getincrementalencoder(encoding)()
|
||||
d.setstate(state)
|
||||
part2 = d.encode(u[i:], True)
|
||||
self.assertEqual(s, part1+part2)
|
||||
|
||||
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
|
||||
def check_partial(self, input, partialresults):
|
||||
# get a StreamReader for the encoding and feed the bytestring version
|
||||
# of input to the reader byte by byte. Read everything available from
|
||||
|
@ -292,7 +325,14 @@ class UTF16Test(ReadTest):
|
|||
)
|
||||
|
||||
def test_errors(self):
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
|
||||
"\xff", "strict", True)
|
||||
|
||||
def test_decoder_state(self):
|
||||
self.check_state_handling_decode(self.encoding,
|
||||
u"spamspam", self.spamle)
|
||||
self.check_state_handling_decode(self.encoding,
|
||||
u"spamspam", self.spambe)
|
||||
|
||||
class UTF16LETest(ReadTest):
|
||||
encoding = "utf-16-le"
|
||||
|
@ -313,7 +353,8 @@ class UTF16LETest(ReadTest):
|
|||
)
|
||||
|
||||
def test_errors(self):
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
|
||||
"\xff", "strict", True)
|
||||
|
||||
class UTF16BETest(ReadTest):
|
||||
encoding = "utf-16-be"
|
||||
|
@ -334,7 +375,8 @@ class UTF16BETest(ReadTest):
|
|||
)
|
||||
|
||||
def test_errors(self):
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
|
||||
"\xff", "strict", True)
|
||||
|
||||
class UTF8Test(ReadTest):
|
||||
encoding = "utf-8"
|
||||
|
@ -357,6 +399,11 @@ class UTF8Test(ReadTest):
|
|||
]
|
||||
)
|
||||
|
||||
def test_decoder_state(self):
|
||||
u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
|
||||
self.check_state_handling_decode(self.encoding,
|
||||
u, u.encode(self.encoding))
|
||||
|
||||
class UTF7Test(ReadTest):
|
||||
encoding = "utf-7"
|
||||
|
||||
|
@ -429,6 +476,16 @@ class UTF8SigTest(ReadTest):
|
|||
# SF bug #1601501: check that the codec works with a buffer
|
||||
unicode("\xef\xbb\xbf", "utf-8-sig")
|
||||
|
||||
def test_bom(self):
|
||||
d = codecs.getincrementaldecoder("utf-8-sig")()
|
||||
s = u"spam"
|
||||
self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
|
||||
|
||||
def test_decoder_state(self):
|
||||
u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
|
||||
self.check_state_handling_decode(self.encoding,
|
||||
u, u.encode(self.encoding))
|
||||
|
||||
class EscapeDecodeTest(unittest.TestCase):
|
||||
def test_empty(self):
|
||||
self.assertEquals(codecs.escape_decode(""), ("", 0))
|
||||
|
@ -1066,7 +1123,11 @@ broken_unicode_with_streams = [
|
|||
"punycode",
|
||||
"unicode_internal"
|
||||
]
|
||||
broken_incremental_coders = broken_unicode_with_streams[:]
|
||||
broken_incremental_coders = broken_unicode_with_streams + [
|
||||
"idna",
|
||||
"zlib_codec",
|
||||
"bz2_codec",
|
||||
]
|
||||
|
||||
# The following encodings only support "strict" mode
|
||||
only_strict_mode = [
|
||||
|
@ -1091,7 +1152,7 @@ else:
|
|||
all_unicode_encodings.append("zlib_codec")
|
||||
broken_unicode_with_streams.append("zlib_codec")
|
||||
|
||||
class BasicUnicodeTest(unittest.TestCase):
|
||||
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
|
||||
def test_basics(self):
|
||||
s = u"abc123" # all codecs should be able to encode these
|
||||
for encoding in all_unicode_encodings:
|
||||
|
@ -1215,6 +1276,14 @@ class BasicUnicodeTest(unittest.TestCase):
|
|||
table_type = type(cp1140.encoding_table)
|
||||
self.assertEqual(table_type, table_type)
|
||||
|
||||
def test_decoder_state(self):
|
||||
# Check that getstate() and setstate() handle the state properly
|
||||
u = u"abc123"
|
||||
for encoding in all_unicode_encodings:
|
||||
if encoding not in broken_incremental_coders:
|
||||
self.check_state_handling_decode(encoding, u, u.encode(encoding))
|
||||
self.check_state_handling_encode(encoding, u, u.encode(encoding))
|
||||
|
||||
class BasicStrTest(unittest.TestCase):
|
||||
def test_basics(self):
|
||||
s = "abc123"
|
||||
|
|
Loading…
Reference in New Issue