Instead of pickling the whole decoder, use the new getstate/setstate API.

This commit is contained in:
Guido van Rossum 2007-04-17 02:38:04 +00:00
parent 3abcb013b8
commit d76e7796c9
2 changed files with 57 additions and 33 deletions

View File

@ -18,7 +18,7 @@ XXX don't use assert to validate input requirements
XXX whenever an argument is None, use the default value
XXX read/write ops should check readable/writable
XXX buffered readinto should work with arbitrary buffer objects
XXX use incremental encoder for text output, at least for UTF-16
XXX use incremental encoder for text output, at least for UTF-16 and UTF-8-SIG
"""
__author__ = ("Guido van Rossum <guido@python.org>, "
@ -36,11 +36,6 @@ import codecs
import _fileio
import warnings
try:
import cPickle as pickle
except ImportError:
import pickle
# XXX Shouldn't we use st_blksize whenever we can?
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
@ -957,17 +952,16 @@ class TextIOWrapper(TextIOBase):
self._newline = newline or os.linesep
self._fix_newlines = newline is None
self._decoder = None
self._decoder_in_rest_pickle = None
self._pending = ""
self._snapshot = None
self._seekable = self._telling = self.buffer.seekable()
# A word about _snapshot. This attribute is either None, or a
# tuple (decoder_pickle, readahead, pending) where decoder_pickle
# is a pickled decoder state, readahead is the chunk of bytes that
# was read, and pending is the characters that were rendered by
# the decoder after feeding it those bytes. We use this to
# reconstruct intermediate decoder states in tell().
# tuple (decoder_state, readahead, pending) where decoder_state is
# the second (integer) item of the decoder state, readahead is the
# decoder's buffered input followed by the chunk of bytes that was
# read, and pending is the characters that were rendered by the
# decoder after feeding it that chunk. We use this to reconstruct
# intermediate decoder states in tell().
# Return the seekability flag stored on the instance.
# NOTE(review): the hunk above assigns an instance attribute that is also
# named `_seekable` (self._seekable = self._telling = ...). If both live in
# the same class, attribute lookup on an instance would find the stored value
# rather than this method -- confirm whether this was meant to be `seekable`.
def _seekable(self):
return self._seekable
@ -1005,10 +999,6 @@ class TextIOWrapper(TextIOBase):
raise IOError("Can't find an incremental decoder for encoding %s" %
self._encoding)
decoder = self._decoder = make_decoder() # XXX: errors
if isinstance(decoder, codecs.BufferedIncrementalDecoder):
# XXX Hack: make the codec use bytes instead of strings
decoder.buffer = b""
self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell()
return decoder
def _read_chunk(self):
@ -1017,15 +1007,13 @@ class TextIOWrapper(TextIOBase):
readahead = self.buffer.read1(self._CHUNK_SIZE)
pending = self._decoder.decode(readahead, not readahead)
return readahead, pending
decoder_state = pickle.dumps(self._decoder, 2)
decoder_buffer, decoder_state = self._decoder.getstate()
readahead = self.buffer.read1(self._CHUNK_SIZE)
pending = self._decoder.decode(readahead, not readahead)
self._snapshot = (decoder_state, readahead, pending)
self._snapshot = (decoder_state, decoder_buffer + readahead, pending)
return readahead, pending
def _encode_decoder_state(self, ds, pos):
if ds == self._decoder_in_rest_pickle:
return pos
x = 0
for i in bytes(ds):
x = x<<8 | i
@ -1048,7 +1036,8 @@ class TextIOWrapper(TextIOBase):
raise IOError("Telling position disabled by next() call")
self.flush()
position = self.buffer.tell()
if self._decoder is None or self._snapshot is None:
decoder = self._decoder
if decoder is None or self._snapshot is None:
assert self._pending == ""
return position
decoder_state, readahead, pending = self._snapshot
@ -1056,15 +1045,21 @@ class TextIOWrapper(TextIOBase):
needed = len(pending) - len(self._pending)
if not needed:
return self._encode_decoder_state(decoder_state, position)
decoder = pickle.loads(decoder_state)
n = 0
bb = bytes(1)
for i, bb[0] in enumerate(readahead):
n += len(decoder.decode(bb))
if n >= needed:
decoder_state = pickle.dumps(decoder, 2)
return self._encode_decoder_state(decoder_state, position+i+1)
raise IOError("Can't reconstruct logical file position")
saved_state = decoder.getstate()
try:
decoder.setstate(("", decoder_state))
n = 0
bb = bytes(1)
for i, bb[0] in enumerate(readahead):
n += len(decoder.decode(bb))
if n >= needed:
decoder_buffer, decoder_state = decoder.getstate()
return self._encode_decoder_state(
decoder_state,
position + (i+1) - len(decoder_buffer))
raise IOError("Can't reconstruct logical file position")
finally:
decoder.setstate(saved_state)
def seek(self, pos, whence=0):
if not self._seekable:
@ -1097,12 +1092,11 @@ class TextIOWrapper(TextIOBase):
self._pending = ""
self._decoder = None
return pos
decoder = pickle.loads(ds)
decoder = self._decoder or self._get_decoder()
# Restore the decoder to the state encoded in the seek cookie. The
# incremental decoder API is getstate()/setstate() (as used in tell());
# there is no set_state() method, so the old spelling would raise
# AttributeError on any seek to a position with non-trivial decoder state.
decoder.setstate(("", ds))
self.buffer.seek(pos)
self._snapshot = (ds, b"", "")
self._pending = ""
if not self._decoder_in_rest_pickle:
self._get_decoder() # For its side effect
self._decoder = decoder
return orig_pos

View File

@ -581,6 +581,36 @@ class TextIOWrapperTest(unittest.TestCase):
self.assertEquals(f.tell(), p2)
f.close()
def testSeeking(self):
    """Check read()/tell()/readline() around a chunk boundary.

    Each line is (_CHUNK_SIZE - 2) ASCII bytes followed by U+8888 (three
    bytes in UTF-8) and a newline, so the multi-byte character starts two
    bytes before a _CHUNK_SIZE offset and crosses it.
    """
    chunk_size = io.TextIOWrapper._CHUNK_SIZE
    prefix_size = chunk_size - 2
    u_prefix = u"a" * prefix_size
    prefix = bytes(u_prefix.encode("utf-8"))
    # The prefix is pure ASCII, so character count and byte count agree.
    self.assertEquals(len(u_prefix), len(prefix))
    u_suffix = u"\u8888\n"
    suffix = bytes(u_suffix.encode("utf-8"))
    line = prefix + suffix
    f = io.open(test_support.TESTFN, "wb")
    try:
        f.write(line*2)
    finally:
        f.close()
    f = io.open(test_support.TESTFN, "r", encoding="utf-8")
    try:
        s = f.read(prefix_size)
        # Text-mode read() yields decoded text, so compare against the
        # text prefix rather than its encoded bytes.
        self.assertEquals(s, u_prefix)
        self.assertEquals(f.tell(), prefix_size)
        self.assertEquals(f.readline(), u_suffix)
    finally:
        # Always release the handle, even when an assertion fails.
        f.close()
def testSeekingToo(self):
    """Regression test: tell() after readline() with a tiny chunk size.

    The data is one three-byte UTF-8 sequence plus a newline; with
    _CHUNK_SIZE forced to 2 the sequence is split across two chunks.
    The test only requires that readline() and tell() do not raise.
    """
    # Regression test for a specific bug
    data = b'\xe0\xbf\xbf\n'
    f = io.open(test_support.TESTFN, "wb")
    try:
        f.write(data)
    finally:
        f.close()
    f = io.open(test_support.TESTFN, "r", encoding="utf-8")
    try:
        f._CHUNK_SIZE  # Just test that it exists
        f._CHUNK_SIZE = 2
        f.readline()
        f.tell()
    finally:
        # Always release the handle, even if readline()/tell() raises.
        f.close()
def timingTest(self):
timer = time.time
enc = "utf8"