This is r61508 plus additional fixes to the handling of 'limit'

in TextIOWrapper.readline().

All tests now pass for me (except for expected skips on darwin:
bsddb, bsddb3, cProfile, codecmaps_*, curses, gdbm, largefile,
locale, normalization, ossaudiodev, pep277, socketserver,
startfile, timeout, urllib2net, urllibnet, winreg, winsound,
xmlrpc_net, zipfile64, and the -u largefile part of test_io).
This commit is contained in:
Ka-Ping Yee 2008-03-20 10:34:07 +00:00
parent 2727503bc9
commit dbe28e573e
1 changed files with 84 additions and 64 deletions

148
Lib/io.py
View File

@ -1180,14 +1180,14 @@ class TextIOWrapper(TextIOBase):
self._encoder = None self._encoder = None
self._decoder = None self._decoder = None
self._decoded_text = "" # buffer for text produced by decoder self._decoded_text = "" # buffer for text produced by decoder
self._decoded_text_offset = 0 # offset to text returned by read()
self._snapshot = None # info for reconstructing decoder state self._snapshot = None # info for reconstructing decoder state
self._seekable = self._telling = self.buffer.seekable() self._seekable = self._telling = self.buffer.seekable()
# A word about _snapshot. This attribute is either None, or a tuple # A word about _snapshot. This attribute is either None, or a tuple
# (decoder_state, input_chunk, decoded_chars) where decoder_state is # (decoder_state, next_input) where decoder_state is the second
# the second (integer) item of the decoder state, input_chunk is the # (integer) item of the decoder state, and next_input is the chunk
# chunk of bytes that was read, and decoded_chars is the number of # of bytes that comes after the snapshot point in the input.
# characters rendered by the decoder after feeding it those bytes.
# We use this to reconstruct intermediate decoder states in tell(). # We use this to reconstruct intermediate decoder states in tell().
# Naming convention: # Naming convention:
@ -1271,10 +1271,10 @@ class TextIOWrapper(TextIOBase):
""" """
Read and decode the next chunk of data from the BufferedReader. Read and decode the next chunk of data from the BufferedReader.
Return a tuple of two elements: all the bytes that were read, and The return value is True unless EOF was reached. The decoded string
the decoded string produced by the decoder. (The entire input is placed in self._decoded_text (replacing its previous value).
chunk is sent to the decoder, but some of it may remain buffered (The entire input chunk is sent to the decoder, though some of it
in the decoder, yet to be converted.) may remain buffered in the decoder, yet to be converted.)
""" """
if self._decoder is None: if self._decoder is None:
@ -1283,8 +1283,9 @@ class TextIOWrapper(TextIOBase):
# No one should call tell(), so don't bother taking a snapshot. # No one should call tell(), so don't bother taking a snapshot.
input_chunk = self.buffer.read1(self._CHUNK_SIZE) input_chunk = self.buffer.read1(self._CHUNK_SIZE)
eof = not input_chunk eof = not input_chunk
decoded = self._decoder.decode(input_chunk, eof) self._decoded_text = self._decoder.decode(input_chunk, eof)
return (input_chunk, decoded) self._decoded_text_offset = 0
return not eof
# The cookie returned by tell() cannot include the contents of # The cookie returned by tell() cannot include the contents of
# the decoder's buffer, so we need to snapshot a point in the # the decoder's buffer, so we need to snapshot a point in the
@ -1298,16 +1299,15 @@ class TextIOWrapper(TextIOBase):
input_chunk = self.buffer.read1(self._CHUNK_SIZE) input_chunk = self.buffer.read1(self._CHUNK_SIZE)
eof = not input_chunk eof = not input_chunk
decoded = self._decoder.decode(input_chunk, eof) self._decoded_text = self._decoder.decode(input_chunk, eof)
self._decoded_text_offset = 0
# At the snapshot point len(dec_buffer) bytes ago, the next input # At the snapshot point, len(dec_buffer) bytes ago, the next input
# to be passed to the decoder is dec_buffer + input_chunk. Save # to be passed to the decoder is dec_buffer + input_chunk.
# len(decoded) so that later, tell() can figure out how much self._snapshot = (dec_flags, dec_buffer + input_chunk)
# decoded data has been used up by TextIOWrapper.read(). return not eof
self._snapshot = (dec_flags, dec_buffer + input_chunk, len(decoded))
return (input_chunk, decoded)
def _encode_tell_cookie(self, position, dec_flags=0, def _pack_cookie(self, position, dec_flags=0,
feed_bytes=0, need_eof=0, skip_chars=0): feed_bytes=0, need_eof=0, skip_chars=0):
# The meaning of a tell() cookie is: seek to position, set the # The meaning of a tell() cookie is: seek to position, set the
# decoder flags to dec_flags, read feed_bytes bytes, feed them # decoder flags to dec_flags, read feed_bytes bytes, feed them
@ -1317,7 +1317,7 @@ class TextIOWrapper(TextIOBase):
return (position | (dec_flags<<64) | (feed_bytes<<128) | return (position | (dec_flags<<64) | (feed_bytes<<128) |
(skip_chars<<192) | bool(need_eof)<<256) (skip_chars<<192) | bool(need_eof)<<256)
def _decode_tell_cookie(self, bigint): def _unpack_cookie(self, bigint):
rest, position = divmod(bigint, 1<<64) rest, position = divmod(bigint, 1<<64)
rest, dec_flags = divmod(rest, 1<<64) rest, dec_flags = divmod(rest, 1<<64)
rest, feed_bytes = divmod(rest, 1<<64) rest, feed_bytes = divmod(rest, 1<<64)
@ -1339,14 +1339,14 @@ class TextIOWrapper(TextIOBase):
return position return position
# Skip backward to the snapshot point (see _read_chunk). # Skip backward to the snapshot point (see _read_chunk).
dec_flags, next_input, decoded_chars = self._snapshot dec_flags, next_input = self._snapshot
position -= len(next_input) position -= len(next_input)
# How many decoded characters have been consumed since the snapshot? # How many decoded characters have been returned since the snapshot?
skip_chars = decoded_chars - len(self._decoded_text) skip_chars = self._decoded_text_offset
if skip_chars == 0: if skip_chars == 0:
# We haven't moved from the snapshot point. # We haven't moved from the snapshot point.
return self._encode_tell_cookie(position, dec_flags) return self._pack_cookie(position, dec_flags)
# Walk the decoder forward, one byte at a time, to find the minimum # Walk the decoder forward, one byte at a time, to find the minimum
# input necessary to give us the decoded characters we need to skip. # input necessary to give us the decoded characters we need to skip.
@ -1373,8 +1373,8 @@ class TextIOWrapper(TextIOBase):
if decoded_chars >= skip_chars: if decoded_chars >= skip_chars:
break break
else: else:
# We didn't get enough decoded data; send EOF to get more. # We didn't get enough decoded data; signal EOF to get more.
decoded = decoder.decode(b"", True) decoded = decoder.decode(b"", final=True)
decoded_chars += len(decoded) decoded_chars += len(decoded)
need_eof = 1 need_eof = 1
if decoded_chars < skip_chars: if decoded_chars < skip_chars:
@ -1385,7 +1385,7 @@ class TextIOWrapper(TextIOBase):
position += safe_fed_bytes position += safe_fed_bytes
fed_bytes -= safe_fed_bytes fed_bytes -= safe_fed_bytes
skip_chars -= safe_decoded_chars skip_chars -= safe_decoded_chars
return self._encode_tell_cookie( return self._pack_cookie(
position, dec_flags, fed_bytes, need_eof, skip_chars) position, dec_flags, fed_bytes, need_eof, skip_chars)
finally: finally:
decoder.setstate(saved_state) decoder.setstate(saved_state)
@ -1405,8 +1405,7 @@ class TextIOWrapper(TextIOBase):
raise IOError("can't do nonzero end-relative seeks") raise IOError("can't do nonzero end-relative seeks")
self.flush() self.flush()
position = self.buffer.seek(0, 2) position = self.buffer.seek(0, 2)
self._decoded_text = "" self._clear_decoded_text()
self._snapshot = None
if self._decoder: if self._decoder:
self._decoder.reset() self._decoder.reset()
return position return position
@ -1419,48 +1418,70 @@ class TextIOWrapper(TextIOBase):
# Seek back to the snapshot point. # Seek back to the snapshot point.
position, dec_flags, feed_bytes, need_eof, skip_chars = \ position, dec_flags, feed_bytes, need_eof, skip_chars = \
self._decode_tell_cookie(cookie) self._unpack_cookie(cookie)
self.buffer.seek(position) self.buffer.seek(position)
self._decoded_text = "" self._clear_decoded_text()
self._snapshot = None
if self._decoder or dec_flags or feed_bytes or need_eof: if self._decoder or dec_flags or feed_bytes or need_eof:
# Restore the decoder flags to their values from the snapshot. # Restore the decoder flags to their values from the snapshot.
self._decoder = self._decoder or self._get_decoder() self._decoder = self._decoder or self._get_decoder()
self._decoder.setstate((b"", dec_flags)) self._decoder.setstate((b"", dec_flags))
self._snapshot = (dec_flags, b'')
if feed_bytes or need_eof: if feed_bytes or need_eof:
# Feed feed_bytes bytes to the decoder. # Feed feed_bytes bytes to the decoder.
input_chunk = self.buffer.read(feed_bytes) input_chunk = self.buffer.read(feed_bytes)
decoded = self._decoder.decode(input_chunk, need_eof) self._decoded_text = self._decoder.decode(input_chunk, need_eof)
if len(decoded) < skip_chars: if len(self._decoded_text) < skip_chars:
raise IOError("can't restore logical file position") raise IOError("can't restore logical file position")
# Skip skip_chars of the decoded characters. # Skip skip_chars of the decoded characters.
self._decoded_text = decoded[skip_chars:] self._decoded_text_offset = skip_chars
# Restore the snapshot. # Restore the snapshot.
self._snapshot = (dec_flags, input_chunk, len(decoded)) self._snapshot = (dec_flags, input_chunk)
return cookie return cookie
def _clear_decoded_text(self):
"""Reset the _decoded_text buffer."""
self._decoded_text = ''
self._decoded_text_offset = 0
self._snapshot = None
def _emit_decoded_text(self, n=None):
"""Advance into the _decoded_text buffer."""
offset = self._decoded_text_offset
if n is None:
text = self._decoded_text[offset:]
else:
text = self._decoded_text[offset:offset + n]
self._decoded_text_offset += len(text)
return text
def _unemit_decoded_text(self, n):
"""Rewind the _decoded_text buffer."""
if self._decoded_text_offset < n:
raise AssertionError("unemit out of bounds")
self._decoded_text_offset -= n
def read(self, n=None): def read(self, n=None):
if n is None: if n is None:
n = -1 n = -1
decoder = self._decoder or self._get_decoder() decoder = self._decoder or self._get_decoder()
result = self._decoded_text
if n < 0: if n < 0:
result += decoder.decode(self.buffer.read(), True) # Read everything.
self._decoded_text = "" result = (self._emit_decoded_text() +
self._snapshot = None decoder.decode(self.buffer.read(), final=True))
self._clear_decoded_text()
return result return result
else: else:
while len(result) < n: # Keep reading chunks until we have n characters to return.
input_chunk, decoded = self._read_chunk() eof = False
result += decoded result = self._emit_decoded_text(n)
if not input_chunk: while len(result) < n and not eof:
break eof = not self._read_chunk()
self._decoded_text = result[n:] result += self._emit_decoded_text(n - len(result))
return result[:n] return result
def __next__(self): def __next__(self):
self._telling = False self._telling = False
@ -1474,16 +1495,10 @@ class TextIOWrapper(TextIOBase):
def readline(self, limit=None): def readline(self, limit=None):
if limit is None: if limit is None:
limit = -1 limit = -1
if limit >= 0:
# XXX Hack to support limit argument, for backwards compatibility
line = self.readline()
if len(line) <= limit:
return line
line, self._decoded_text = \
line[:limit], line[limit:] + self._decoded_text
return line
line = self._decoded_text # Grab all the decoded text (we will rewind any extra bits later).
line = self._emit_decoded_text()
start = 0 start = 0
decoder = self._decoder or self._get_decoder() decoder = self._decoder or self._get_decoder()
@ -1536,22 +1551,27 @@ class TextIOWrapper(TextIOBase):
endpos = pos + len(self._readnl) endpos = pos + len(self._readnl)
break break
if limit >= 0 and len(line) >= limit:
endpos = limit # reached length limit
break
# No line ending seen yet - get more data # No line ending seen yet - get more data
more_line = '' more_line = ''
while True: while self._read_chunk():
readahead, pending = self._read_chunk() if self._decoded_text:
more_line = pending
if more_line or not readahead:
break break
if more_line: if self._decoded_text:
line += more_line line += self._emit_decoded_text()
else: else:
# end of file # end of file
self._decoded_text = '' self._clear_decoded_text()
self._snapshot = None
return line return line
self._decoded_text = line[endpos:] if limit >= 0 and endpos > limit:
endpos = limit # don't exceed limit
# Rewind _decoded_text to just after the line ending we found.
self._unemit_decoded_text(len(line) - endpos)
return line[:endpos] return line[:endpos]
@property @property