Added a working Text I/O layer, by Mark Russell.

This is essentially a checkpoint.
This commit is contained in:
Guido van Rossum 2007-04-06 17:31:18 +00:00
parent 0e074483e7
commit 78892e4613
2 changed files with 366 additions and 47 deletions

281
Lib/io.py
View File

@ -3,7 +3,7 @@
This is an early prototype; eventually some of this will be This is an early prototype; eventually some of this will be
reimplemented in C and the rest may be turned into a package. reimplemented in C and the rest may be turned into a package.
See PEP XXX; for now: http://docs.google.com/Doc?id=dfksfvqd_1cn5g5m See PEP 3116.
XXX need to default buffer size to 1 if isatty() XXX need to default buffer size to 1 if isatty()
XXX need to support 1 meaning line-buffered XXX need to support 1 meaning line-buffered
@ -11,20 +11,24 @@ XXX change behavior of blocking I/O
""" """
__author__ = ("Guido van Rossum <guido@python.org>, " __author__ = ("Guido van Rossum <guido@python.org>, "
"Mike Verdone <mike.verdone@gmail.com>") "Mike Verdone <mike.verdone@gmail.com>, "
"Mark Russell <mark.russell@zen.co.uk>")
__all__ = ["open", "RawIOBase", "FileIO", "SocketIO", "BytesIO", __all__ = ["open", "RawIOBase", "FileIO", "SocketIO", "BytesIO",
"BufferedReader", "BufferedWriter", "BufferedRWPair", "BufferedReader", "BufferedWriter", "BufferedRWPair",
"BufferedRandom", "EOF"] "BufferedRandom"]
import os import os
import sys
import codecs
import warnings
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
DEFAULT_MAX_BUFFER_SIZE = 16 * 1024 # bytes DEFAULT_MAX_BUFFER_SIZE = 16 * 1024 # bytes
EOF = b'' # XXX This is wrong because it's mutable
class BlockingIO(IOError): class BlockingIO(IOError):
def __init__(self, errno, strerror, characters_written): def __init__(self, errno, strerror, characters_written):
IOError.__init__(self, errno, strerror) IOError.__init__(self, errno, strerror)
self.characters_written = characters_written self.characters_written = characters_written
@ -125,14 +129,12 @@ class RawIOBase:
seeked. seeked.
The read() method is implemented by calling readinto(); derived The read() method is implemented by calling readinto(); derived
classes that want to support readon only need to implement classes that want to support read() only need to implement
readinto() as a primitive operation. readinto() as a primitive operation.
""" """
# XXX Add individual method docstrings
def read(self, n): def read(self, n):
"""Read and return up to n bytes. """read(n: int) -> bytes. Read and return up to n bytes.
Returns an empty bytes array on EOF, or None if the object is Returns an empty bytes array on EOF, or None if the object is
set not to block and has no data to read. set not to block and has no data to read.
@ -143,43 +145,80 @@ class RawIOBase:
return b return b
def readinto(self, b): def readinto(self, b):
"""readinto(b: bytes) -> None. Read up to len(b) bytes into b.
Returns number of bytes read (0 for EOF), or None if the object
is set not to block and has no data to read.
"""
raise IOError(".readinto() not supported") raise IOError(".readinto() not supported")
def write(self, b): def write(self, b):
"""Write the given buffer to the IO stream. """write(b: bytes) -> int. Write the given buffer to the IO stream.
Returns the number of bytes written. Returns the number of bytes written, which may be less than len(b).
""" """
raise IOError(".write() not supported") raise IOError(".write() not supported")
def seek(self, pos, whence=0): def seek(self, pos, whence=0):
"""seek(pos: int, whence: int = 0) -> None. Change stream position.
Seek to byte offset pos relative to position indicated by whence:
0 Start of stream (the default). pos should be >= 0;
1 Current position - pos may be negative;
2 End of stream - pos usually negative.
"""
raise IOError(".seek() not supported") raise IOError(".seek() not supported")
def tell(self): def tell(self):
"""tell() -> int. Return current stream position."""
raise IOError(".tell() not supported") raise IOError(".tell() not supported")
def truncate(self, pos=None): def truncate(self, pos=None):
"""truncate(size: int = None) -> None. Truncate file to size bytes.
Size defaults to the current IO position as reported by tell().
"""
raise IOError(".truncate() not supported") raise IOError(".truncate() not supported")
def close(self): def close(self):
"""close() -> None. Close IO object."""
pass pass
def seekable(self): def seekable(self):
"""seekable() -> bool. Return whether object supports random access.
If False, seek(), tell() and truncate() will raise IOError.
This method may need to do a test seek().
"""
return False return False
def readable(self): def readable(self):
"""readable() -> bool. Return whether object was opened for reading.
If False, read() will raise IOError.
"""
return False return False
def writable(self): def writable(self):
"""writable() -> bool. Return whether object was opened for writing.
If False, write() and truncate() will raise IOError.
"""
return False return False
def __enter__(self): def __enter__(self):
"""Context management protocol. Returns self."""
return self return self
def __exit__(self, *args): def __exit__(self, *args):
"""Context management protocol. Same as close()"""
self.close() self.close()
def fileno(self): def fileno(self):
"""fileno() -> int. Return underlying file descriptor if there is one.
Raises IOError if the IO object does not use a file descriptor.
"""
raise IOError(".fileno() not supported") raise IOError(".fileno() not supported")
@ -252,6 +291,8 @@ try:
import _fileio import _fileio
except ImportError: except ImportError:
# Let's use the Python version # Let's use the Python version
warnings.warn("Can't import _fileio, using slower Python lookalike",
RuntimeWarning)
FileIO = _PyFileIO FileIO = _PyFileIO
else: else:
# Create a trivial subclass with the proper inheritance structure # Create a trivial subclass with the proper inheritance structure
@ -295,17 +336,13 @@ class BufferedIOBase(RawIOBase):
"""XXX Docstring.""" """XXX Docstring."""
class BytesIO(BufferedIOBase): class _MemoryBufferMixin:
"""Buffered I/O implementation using a bytes buffer, like StringIO.""" # XXX docstring
# XXX More docs def __init__(self, buffer):
self._buffer = buffer
def __init__(self, inital_bytes=None):
self._buffer = b""
self._pos = 0 self._pos = 0
if inital_bytes is not None:
self._buffer += inital_bytes
def getvalue(self): def getvalue(self):
return self._buffer return self._buffer
@ -362,6 +399,35 @@ class BytesIO(BufferedIOBase):
return True return True
class BytesIO(_MemoryBufferMixin, BufferedIOBase):

    """In-memory buffered I/O object whose contents are a bytes buffer.

    The buffer and position state are set up by
    _MemoryBufferMixin.__init__().
    """

    # XXX More docs

    def __init__(self, inital_bytes=None):
        """Create the stream, optionally pre-filled with inital_bytes."""
        # Concatenating onto a fresh empty buffer means the stream never
        # shares its buffer object with the caller's argument.
        contents = b"" if inital_bytes is None else b"" + inital_bytes
        _MemoryBufferMixin.__init__(self, contents)
class StringIO(_MemoryBufferMixin, BufferedIOBase):

    """In-memory buffered I/O object whose contents are a str buffer.

    The buffer and position state are set up by
    _MemoryBufferMixin.__init__().
    """

    # XXX More docs

    # XXX Reuses the same code as BytesIO, just with a string rather
    # than bytes as the _buffer value.  That won't work in C of course.

    def __init__(self, inital_string=None):
        """Create the stream, optionally pre-filled with inital_string."""
        contents = "" if inital_string is None else "" + inital_string
        _MemoryBufferMixin.__init__(self, contents)
class BufferedIOBase(RawIOBase): class BufferedIOBase(RawIOBase):
"""Base class for buffered IO objects.""" """Base class for buffered IO objects."""
@ -375,15 +441,17 @@ class BufferedReader(BufferedIOBase):
"""Buffer for a readable sequential RawIO object. """Buffer for a readable sequential RawIO object.
Does not allow random access (seek, tell). Does not allow random access (seek, tell). (Use BufferedRandom
for that.)
""" """
def __init__(self, raw, unused_buffer_size=None): def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE):
"""Create a new buffered reader using the given readable raw IO object. """Create a new buffered reader using the given readable raw IO object.
""" """
assert raw.readable() assert raw.readable()
self.raw = raw self.raw = raw
self._read_buf = b"" self._read_buf = b""
self.buffer_size = buffer_size
if hasattr(raw, 'fileno'): if hasattr(raw, 'fileno'):
self.fileno = raw.fileno self.fileno = raw.fileno
@ -395,11 +463,13 @@ class BufferedReader(BufferedIOBase):
mode. If n is None, read until EOF or until read() would mode. If n is None, read until EOF or until read() would
block. block.
""" """
assert n is None or n > 0 assert n is None or n > 0, '.read(): Bad read size %r' % n
nodata_val = EOF nodata_val = b""
while (len(self._read_buf) < n) if (n is not None) else True: while n is None or len(self._read_buf) < n:
current = self.raw.read(n) to_read = None if n is None else max(n, self.buffer_size)
if current in (EOF, None): current = self.raw.read(to_read)
if current in (b"", None):
nodata_val = current nodata_val = current
break break
self._read_buf += current self._read_buf += current
@ -428,6 +498,8 @@ class BufferedReader(BufferedIOBase):
class BufferedWriter(BufferedIOBase): class BufferedWriter(BufferedIOBase):
# XXX docstring
def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE, def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE,
max_buffer_size=DEFAULT_MAX_BUFFER_SIZE): max_buffer_size=DEFAULT_MAX_BUFFER_SIZE):
assert raw.writable() assert raw.writable()
@ -488,6 +560,8 @@ class BufferedRWPair(BufferedReader, BufferedWriter):
A buffered reader object and buffered writer object put together to A buffered reader object and buffered writer object put together to
form a sequential IO object that can read and write. form a sequential IO object that can read and write.
This is typically used with a socket or two-way pipe.
""" """
def __init__(self, reader, writer, buffer_size=DEFAULT_BUFFER_SIZE, def __init__(self, reader, writer, buffer_size=DEFAULT_BUFFER_SIZE,
@ -528,6 +602,8 @@ class BufferedRWPair(BufferedReader, BufferedWriter):
class BufferedRandom(BufferedReader, BufferedWriter): class BufferedRandom(BufferedReader, BufferedWriter):
# XXX docstring
def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE, def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE,
max_buffer_size=DEFAULT_MAX_BUFFER_SIZE): max_buffer_size=DEFAULT_MAX_BUFFER_SIZE):
assert raw.seekable() assert raw.seekable()
@ -561,6 +637,8 @@ class BufferedRandom(BufferedReader, BufferedWriter):
return BufferedReader.read(self, n) return BufferedReader.read(self, n)
def write(self, b): def write(self, b):
if self._read_buf:
self.raw.seek(-len(self._read_buf), 1) # Undo readahead
self._read_buf = b"" self._read_buf = b""
return BufferedWriter.write(self, b) return BufferedWriter.write(self, b)
@ -569,3 +647,156 @@ class BufferedRandom(BufferedReader, BufferedWriter):
def close(self): def close(self):
self.raw.close() self.raw.close()
class TextIOBase(BufferedIOBase):

    """Base class for text I/O.

    This class provides a character and line based interface to stream I/O.

    read(), write() and readline() raise IOError here and are meant to
    be overridden.  Line iteration is implemented on top of readline().
    """

    def read(self, n: int = -1) -> str:
        """read(n: int = -1) -> str.  Read at most n characters from stream.

        Read from underlying buffer until we have n characters or we hit EOF.
        If n is negative or omitted, read until EOF.
        """
        raise IOError(".read() not supported")

    def write(self, s: str):
        """write(s: str) -> None.  Write string s to stream.
        """
        raise IOError(".write() not supported")

    def readline(self) -> str:
        """readline() -> str.  Read until newline or EOF.

        Returns an empty string if EOF is hit immediately.
        """
        raise IOError(".readline() not supported")

    def __iter__(self):
        """__iter__() -> Iterator.  Return line iterator (actually just self).
        """
        return self

    def next(self):
        """Same as readline() except raises StopIteration on immediate EOF.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    def __next__(self):
        """Iterator protocol entry point; same as next().

        The iterator protocol looks up __next__(), so without this method
        the object returned by __iter__() could not be iterated.  It
        delegates to next() so that subclasses overriding next() are
        still honoured.
        """
        return self.next()
class TextIOWrapper(TextIOBase):

    """Buffered text stream.

    Character and line based layer over a BufferedIOBase object.

    Constructor arguments:

    buffer   -- the underlying byte stream; its read() and write()
                methods are used.
    encoding -- codec name.  Defaults to sys.getfilesystemencoding(),
                falling back to "latin-1" if that returns None.
    newline  -- None, LF or CR LF; anything else raises IOError.  With
                None, readline() replaces a CR or CR LF line ending by a
                single LF; otherwise line endings are returned as read.

    readline() always recognises LF, CR and CR LF as line endings.
    read() performs no newline translation, and write() encodes the
    string as given.
    """

    # XXX tell(), seek()

    def __init__(self, buffer, encoding=None, newline=None):
        if newline not in (None, '\n', '\r\n'):
            raise IOError("illegal newline %s" % newline) # XXX: ValueError?
        if encoding is None:
            # XXX This is questionable
            encoding = sys.getfilesystemencoding()
            if encoding is None:
                encoding = "latin-1"  # XXX, but this is best for transparency

        self.buffer = buffer
        self._encoding = encoding
        # Stored for later use; none of the methods below consult it yet.
        self._newline = newline or os.linesep
        self._fix_newlines = newline is None
        # The incremental decoder is created lazily by _get_decoder().
        self._decoder = None
        # Decoded text that has been read ahead but not yet returned.
        self._pending = ''

    def write(self, s: str):
        """Encode s and write it to the underlying buffer.

        Returns whatever the buffer's write() returns.
        """
        return self.buffer.write(s.encode(self._encoding))

    def _get_decoder(self):
        """Create, remember and return the incremental decoder."""
        make_decoder = codecs.getincrementaldecoder(self._encoding)
        if make_decoder is None:
            raise IOError(".readline() not supported for encoding %s" %
                          self._encoding)
        decoder = self._decoder = make_decoder()  # XXX: errors
        if isinstance(decoder, codecs.BufferedIncrementalDecoder):
            # XXX Hack: make the codec use bytes instead of strings
            decoder.buffer = b""
        return decoder

    def read(self, n: int = -1):
        """Read and return at most n characters; everything if n < 0.

        Text decoded beyond the first n characters is kept in
        self._pending for the next read() or readline() call.
        """
        decoder = self._decoder or self._get_decoder()
        res = self._pending
        if n < 0:
            res += decoder.decode(self.buffer.read(), True)
            self._pending = ''
            return res
        else:
            while len(res) < n:
                data = self.buffer.read(64)
                # An empty read means EOF: tell the decoder to finish.
                res += decoder.decode(data, not data)
                if not data:
                    break
            self._pending = res[n:]
            return res[:n]

    def readline(self):
        """Read and return one line, including its line ending.

        Returns '' at EOF.  The final line is returned without an
        ending if the stream does not end with one.  When the wrapper
        was created with newline=None, a CR or CR LF ending is returned
        as a single LF.
        """
        line = self._pending
        start = 0
        decoder = self._decoder or self._get_decoder()

        while True:
            # In C we'd look for these in parallel of course.
            nlpos = line.find("\n", start)
            crpos = line.find("\r", start)
            if nlpos >= 0 and crpos >= 0:
                endpos = min(nlpos, crpos)
            else:
                endpos = nlpos if nlpos >= 0 else crpos

            if endpos != -1:
                endc = line[endpos]
                if endc == "\n":
                    ending = "\n"
                    break

                # We've seen \r - is it standalone, \r\n or \r at end of line?
                if endpos + 1 < len(line):
                    if line[endpos+1] == '\n':
                        ending = "\r\n"
                    else:
                        ending = "\r"
                    break
                # There might be a following \n in the next block of data ...
                start = endpos
            else:
                start = len(line)

            # No line ending seen yet - get more data
            while True:
                data = self.buffer.read(64)
                more_line = decoder.decode(data, not data)
                if more_line != "" or not data:
                    break

            if more_line == "":
                # EOF reached.
                if endpos != -1:
                    # The text ends in a \r that was held back in case a
                    # \n followed.  Nothing follows, so that \r is the
                    # line ending and must get the same newline
                    # translation as any other standalone \r.
                    ending = "\r"
                else:
                    ending = ''
                    endpos = len(line)
                break

            line += more_line

        nextpos = endpos + len(ending)
        self._pending = line[nextpos:]

        # XXX Update self.newlines here if we want to support that

        if self._fix_newlines and ending != "\n" and ending != '':
            return line[:endpos] + "\n"
        else:
            return line[:nextpos]

View File

@ -2,7 +2,7 @@
import unittest import unittest
from test import test_support from test import test_support
from itertools import chain
import io import io
@ -16,7 +16,7 @@ class MockIO(io.RawIOBase):
try: try:
return self._readStack.pop(0) return self._readStack.pop(0)
except: except:
return io.EOF return b""
def write(self, b): def write(self, b):
self._writeStack.append(b) self._writeStack.append(b)
@ -41,6 +41,18 @@ class MockIO(io.RawIOBase):
return 42 return 42
class MockFileIO(io.BytesIO):

    """BytesIO that logs the length of the result of every read() call.

    The lengths are appended to self.read_history, with None recorded
    when the underlying read() returns None.
    """

    def __init__(self, data):
        self.read_history = []
        io.BytesIO.__init__(self, data)

    def read(self, n=None):
        chunk = io.BytesIO.read(self, n)
        if chunk is None:
            size = None
        else:
            size = len(chunk)
        self.read_history.append(size)
        return chunk
class MockNonBlockWriterIO(io.RawIOBase): class MockNonBlockWriterIO(io.RawIOBase):
def __init__(self, blockingScript): def __init__(self, blockingScript):
@ -147,31 +159,31 @@ class IOTest(unittest.TestCase):
f.close() f.close()
class BytesIOTest(unittest.TestCase): class MemorySeekTest(unittest.TestCase):
def testInit(self): def testInit(self):
buf = b"1234567890" buf = self.buftype("1234567890")
bytesIo = io.BytesIO(buf) bytesIo = self.ioclass(buf)
def testRead(self): def testRead(self):
buf = b"1234567890" buf = self.buftype("1234567890")
bytesIo = io.BytesIO(buf) bytesIo = self.ioclass(buf)
self.assertEquals(buf[:1], bytesIo.read(1)) self.assertEquals(buf[:1], bytesIo.read(1))
self.assertEquals(buf[1:5], bytesIo.read(4)) self.assertEquals(buf[1:5], bytesIo.read(4))
self.assertEquals(buf[5:], bytesIo.read(900)) self.assertEquals(buf[5:], bytesIo.read(900))
self.assertEquals(io.EOF, bytesIo.read()) self.assertEquals(self.EOF, bytesIo.read())
def testReadNoArgs(self): def testReadNoArgs(self):
buf = b"1234567890" buf = self.buftype("1234567890")
bytesIo = io.BytesIO(buf) bytesIo = self.ioclass(buf)
self.assertEquals(buf, bytesIo.read()) self.assertEquals(buf, bytesIo.read())
self.assertEquals(io.EOF, bytesIo.read()) self.assertEquals(self.EOF, bytesIo.read())
def testSeek(self): def testSeek(self):
buf = b"1234567890" buf = self.buftype("1234567890")
bytesIo = io.BytesIO(buf) bytesIo = self.ioclass(buf)
bytesIo.read(5) bytesIo.read(5)
bytesIo.seek(0) bytesIo.seek(0)
@ -181,8 +193,8 @@ class BytesIOTest(unittest.TestCase):
self.assertEquals(buf[3:], bytesIo.read()) self.assertEquals(buf[3:], bytesIo.read())
def testTell(self): def testTell(self):
buf = b"1234567890" buf = self.buftype("1234567890")
bytesIo = io.BytesIO(buf) bytesIo = self.ioclass(buf)
self.assertEquals(0, bytesIo.tell()) self.assertEquals(0, bytesIo.tell())
bytesIo.seek(5) bytesIo.seek(5)
@ -191,6 +203,18 @@ class BytesIOTest(unittest.TestCase):
self.assertEquals(10000, bytesIo.tell()) self.assertEquals(10000, bytesIo.tell())
class BytesIOTest(MemorySeekTest):

    """Run the MemorySeekTest cases against io.BytesIO."""

    ioclass = io.BytesIO  # stream class instantiated by the tests
    buftype = bytes       # called on the str test data to build the buffer
    EOF = b""             # value read() is expected to return at end of data
class StringIOTest(MemorySeekTest):

    """Run the MemorySeekTest cases against io.StringIO."""

    ioclass = io.StringIO  # stream class instantiated by the tests
    buftype = str          # called on the str test data to build the buffer
    EOF = ""               # value read() is expected to return at end of data
class BufferedReaderTest(unittest.TestCase): class BufferedReaderTest(unittest.TestCase):
def testRead(self): def testRead(self):
@ -199,6 +223,25 @@ class BufferedReaderTest(unittest.TestCase):
self.assertEquals(b"abcdef", bufIo.read(6)) self.assertEquals(b"abcdef", bufIo.read(6))
def testBuffering(self):
    """Check which raw reads BufferedReader performs for given reads.

    Each case is (buffer size, sizes requested from the buffered
    reader, expected read_history of the MockFileIO underneath).
    """
    payload = b"abcdefghi"
    total = len(payload)

    cases = (
        (100, (3, 1, 4, 8), [total, 0]),
        (100, (3, 3, 3), [total]),
        (4, (1, 2, 4, 2), [4, 4, 1]),
    )

    for bufsize, requested_sizes, expected_history in cases:
        raw = MockFileIO(payload)
        reader = io.BufferedReader(raw, buffer_size=bufsize)
        offset = 0
        for size in requested_sizes:
            # Slicing past the end yields the short (or empty) tail.
            expected = payload[offset:offset + size]
            self.assertEquals(reader.read(size), expected)
            offset += size

        self.assertEquals(raw.read_history, expected_history)
def testReadNonBlocking(self): def testReadNonBlocking(self):
# Inject some None's in there to simulate EWOULDBLOCK # Inject some None's in there to simulate EWOULDBLOCK
rawIo = MockIO((b"abc", b"d", None, b"efg", None, None)) rawIo = MockIO((b"abc", b"d", None, b"efg", None, None))
@ -208,7 +251,7 @@ class BufferedReaderTest(unittest.TestCase):
self.assertEquals(b"e", bufIo.read(1)) self.assertEquals(b"e", bufIo.read(1))
self.assertEquals(b"fg", bufIo.read()) self.assertEquals(b"fg", bufIo.read())
self.assert_(None is bufIo.read()) self.assert_(None is bufIo.read())
self.assertEquals(io.EOF, bufIo.read()) self.assertEquals(b"", bufIo.read())
def testReadToEof(self): def testReadToEof(self):
rawIo = MockIO((b"abc", b"d", b"efg")) rawIo = MockIO((b"abc", b"d", b"efg"))
@ -270,8 +313,9 @@ class BufferedWriterTest(unittest.TestCase):
bufIo.write(b"asdfasdfasdf") bufIo.write(b"asdfasdfasdf")
# XXX I don't like this test. It relies too heavily on how the algorithm # XXX I don't like this test. It relies too heavily on how the
# actually works, which we might change. Refactor later. # algorithm actually works, which we might change. Refactor
# later.
def testFileno(self): def testFileno(self):
rawIo = MockIO((b"abc", b"d", b"efg")) rawIo = MockIO((b"abc", b"d", b"efg"))
@ -299,7 +343,7 @@ class BufferedRWPairTest(unittest.TestCase):
# XXX need implementation # XXX need implementation
class BufferedRandom(unittest.TestCase): class BufferedRandomTest(unittest.TestCase):
def testReadAndWrite(self): def testReadAndWrite(self):
raw = MockIO((b"asdf", b"ghjk")) raw = MockIO((b"asdf", b"ghjk"))
@ -331,12 +375,56 @@ class BufferedRandom(unittest.TestCase):
self.assertEquals(7, rw.tell()) self.assertEquals(7, rw.tell())
self.assertEquals(b"fl", rw.read(11)) self.assertEquals(b"fl", rw.read(11))
class TextIOWrapperTest(unittest.TestCase):

    """Tests for io.TextIOWrapper."""

    def _collect_lines(self, textIo, do_reads):
        """Return all lines of textIo as a list.

        With do_reads false the stream is simply iterated.  Otherwise
        each line is fetched as a two-character read() followed by a
        readline(), to mix the two access styles.
        """
        if not do_reads:
            return list(textIo)

        lines = []
        while True:
            head = textIo.read(2)
            if head == '':
                break
            self.assertEquals(len(head), 2)
            lines.append(head + textIo.readline())
        return lines

    def testNewlines(self):
        """Check line splitting and translation for each newline setting."""
        raw_lines = ["unix\n", "windows\r\n", "os9\r", "last\n", "nonl"]
        translated = ['unix\n', 'windows\n', 'os9\n', 'last\n', 'nonl']

        # (newline argument, expected line contents before padding)
        cases = (
            (None, translated),
            ('\n', raw_lines),
            ('\r\n', raw_lines),
        )

        # Try a range of pad sizes to test the case where \r is the last
        # character in TextIOWrapper._pending.
        pad_lengths = list(chain(range(10), range(50, 60)))

        for encoding in ('utf-8', 'bz2'):
            for do_reads in (False, True):
                for padlen in pad_lengths:
                    pad = '.' * padlen
                    text = ''.join(pad + line for line in raw_lines)
                    # XXX: str.encode() should return bytes
                    data = bytes(text.encode(encoding))

                    for newline, line_ends in cases:
                        expected = [pad + line for line in line_ends]
                        reader = io.BufferedReader(io.BytesIO(data))
                        textIo = io.TextIOWrapper(reader, newline=newline,
                                                  encoding=encoding)
                        got = self._collect_lines(textIo, do_reads)

                        for got_line, exp_line in zip(got, expected):
                            self.assertEquals(got_line, exp_line)
                        self.assertEquals(len(got), len(expected))
# XXX Tests for open() # XXX Tests for open()
def test_main(): def test_main():
test_support.run_unittest(IOTest, BytesIOTest, BufferedReaderTest, test_support.run_unittest(IOTest, BytesIOTest, StringIOTest,
BufferedReaderTest,
BufferedWriterTest, BufferedRWPairTest, BufferedWriterTest, BufferedRWPairTest,
BufferedRandom) BufferedRandomTest, TextIOWrapperTest)
if __name__ == "__main__": if __name__ == "__main__":
test_main() test_main()