Added a working Text I/O layer, by Mark Russell.

This is essentially a checkpoint.
2007-04-06 17:31:18 +00:00 · 2007-04-06 17:31:18 +00:00 · 78892e4613
parent 0e074483e7
commit 78892e4613
2 changed files with 366 additions and 47 deletions
--- a/Lib/io.py
+++ b/Lib/io.py
@ -3,7 +3,7 @@
 This is an early prototype; eventually some of this will be
 reimplemented in C and the rest may be turned into a package.

-See PEP XXX; for now: http://docs.google.com/Doc?id=dfksfvqd_1cn5g5m
+See PEP 3116.

 XXX need to default buffer size to 1 if isatty()
 XXX need to support 1 meaning line-buffered
@ -11,20 +11,24 @@ XXX change behavior of blocking I/O
 """

 __author__ = ("Guido van Rossum <guido@python.org>, "
-              "Mike Verdone <mike.verdone@gmail.com>")
+              "Mike Verdone <mike.verdone@gmail.com>, "
+              "Mark Russell <mark.russell@zen.co.uk>")

 __all__ = ["open", "RawIOBase", "FileIO", "SocketIO", "BytesIO",
           "BufferedReader", "BufferedWriter", "BufferedRWPair",
-           "BufferedRandom", "EOF"]
+           "BufferedRandom"]

 import os
+import sys
+import codecs
+import warnings

 DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
 DEFAULT_MAX_BUFFER_SIZE = 16 * 1024 # bytes
-EOF = b''  # XXX This is wrong because it's mutable


 class BlockingIO(IOError):
+
    def __init__(self, errno, strerror, characters_written):
        IOError.__init__(self, errno, strerror)
        self.characters_written = characters_written
@ -125,14 +129,12 @@ class RawIOBase:
    seeked.

    The read() method is implemented by calling readinto(); derived
-    classes that want to support readon only need to implement
+    classes that want to support read() only need to implement
    readinto() as a primitive operation.
    """

-    # XXX Add individual method docstrings
-
    def read(self, n):
-        """Read and return up to n bytes.
+        """read(n: int) -> bytes.  Read and return up to n bytes.

        Returns an empty bytes array on EOF, or None if the object is
        set not to block and has no data to read.
@ -143,43 +145,80 @@ class RawIOBase:
        return b

    def readinto(self, b):
+        """readinto(b: bytes) -> int.  Read up to len(b) bytes into b.
+
+        Returns number of bytes read (0 for EOF), or None if the object
+        is set not to block and has no data to read.
+        """
        raise IOError(".readinto() not supported")

    def write(self, b):
-        """Write the given buffer to the IO stream.
+        """write(b: bytes) -> int.  Write the given buffer to the IO stream.

-        Returns the number of bytes written.
+        Returns the number of bytes written, which may be less than len(b).
        """
        raise IOError(".write() not supported")

    def seek(self, pos, whence=0):
+        """seek(pos: int, whence: int = 0) -> None.  Change stream position.
+
+        Seek to byte offset pos relative to position indicated by whence:
+             0  Start of stream (the default).  pos should be >= 0;
+             1  Current position - pos may be negative;
+             2  End of stream - pos usually negative.
+        """
        raise IOError(".seek() not supported")

    def tell(self):
+        """tell() -> int.  Return current stream position."""
        raise IOError(".tell() not supported")

    def truncate(self, pos=None):
+        """truncate(pos: int = None) -> None.  Truncate file to pos bytes.
+
+        pos defaults to the current IO position as reported by tell().
+        """
        raise IOError(".truncate() not supported")

    def close(self):
+        """close() -> None.  Close IO object."""
        pass

    def seekable(self):
+        """seekable() -> bool.  Return whether object supports random access.
+
+        If False, seek(), tell() and truncate() will raise IOError.
+        This method may need to do a test seek().
+        """
        return False

    def readable(self):
+        """readable() -> bool.  Return whether object was opened for reading.
+
+        If False, read() will raise IOError.
+        """
        return False

    def writable(self):
+        """writable() -> bool.  Return whether object was opened for writing.
+
+        If False, write() and truncate() will raise IOError.
+        """
        return False

    def __enter__(self):
+        """Context management protocol.  Returns self."""
        return self

    def __exit__(self, *args):
+        """Context management protocol.  Same as close()"""
        self.close()

    def fileno(self):
+        """fileno() -> int.  Return underlying file descriptor if there is one.
+
+        Raises IOError if the IO object does not use a file descriptor.
+        """
        raise IOError(".fileno() not supported")


@ -252,6 +291,8 @@ try:
    import _fileio
 except ImportError:
    # Let's use the Python version
+    warnings.warn("Can't import _fileio, using slower Python lookalike",
+                  RuntimeWarning)
    FileIO = _PyFileIO
 else:
    # Create a trivial subclass with the proper inheritance structure
@ -295,17 +336,13 @@ class BufferedIOBase(RawIOBase):
    """XXX Docstring."""


-class BytesIO(BufferedIOBase):
+class _MemoryBufferMixin:

-    """Buffered I/O implementation using a bytes buffer, like StringIO."""
+    # XXX docstring

-    # XXX More docs
-
-    def __init__(self, inital_bytes=None):
-        self._buffer = b""
+    def __init__(self, buffer):
+        self._buffer = buffer
        self._pos = 0
-        if inital_bytes is not None:
-            self._buffer += inital_bytes

    def getvalue(self):
        return self._buffer
@ -362,6 +399,35 @@ class BytesIO(BufferedIOBase):
        return True


+class BytesIO(_MemoryBufferMixin, BufferedIOBase):
+
+    """Buffered I/O implementation using a bytes buffer, like StringIO."""
+
+    # XXX More docs
+
+    def __init__(self, inital_bytes=None):
+        buffer = b""
+        if inital_bytes is not None:
+            buffer += inital_bytes
+        _MemoryBufferMixin.__init__(self, buffer)
+
+
+class StringIO(_MemoryBufferMixin, BufferedIOBase):
+
+    """Buffered I/O implementation using a string buffer, like StringIO."""
+
+    # XXX More docs
+
+    # XXX Reuses the same code as BytesIO, just with a string rather
+    # than bytes as the _buffer value.  That won't work in C of course.
+
+    def __init__(self, inital_string=None):
+        buffer = ""
+        if inital_string is not None:
+            buffer += inital_string
+        _MemoryBufferMixin.__init__(self, buffer)
+
+
 class BufferedIOBase(RawIOBase):

    """Base class for buffered IO objects."""
@ -375,15 +441,17 @@ class BufferedReader(BufferedIOBase):

    """Buffer for a readable sequential RawIO object.

-    Does not allow random access (seek, tell).
+    Does not allow random access (seek, tell).  (Use BufferedRandom
+    for that.)
    """

-    def __init__(self, raw, unused_buffer_size=None):
+    def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE):
        """Create a new buffered reader using the given readable raw IO object.
        """
        assert raw.readable()
        self.raw = raw
        self._read_buf = b""
+        self.buffer_size = buffer_size
        if hasattr(raw, 'fileno'):
            self.fileno = raw.fileno

@ -395,11 +463,13 @@ class BufferedReader(BufferedIOBase):
        mode. If n is None, read until EOF or until read() would
        block.
        """
-        assert n is None or n > 0
-        nodata_val = EOF
-        while (len(self._read_buf) < n) if (n is not None) else True:
-            current = self.raw.read(n)
-            if current in (EOF, None):
+        assert n is None or n > 0, '.read(): Bad read size %r' % n
+        nodata_val = b""
+        while n is None or len(self._read_buf) < n:
+            to_read = None if n is None else max(n, self.buffer_size)
+            current = self.raw.read(to_read)
+
+            if current in (b"", None):
                nodata_val = current
                break
            self._read_buf += current
@ -428,6 +498,8 @@ class BufferedReader(BufferedIOBase):

 class BufferedWriter(BufferedIOBase):

+    # XXX docstring
+
    def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE,
                 max_buffer_size=DEFAULT_MAX_BUFFER_SIZE):
        assert raw.writable()
@ -488,6 +560,8 @@ class BufferedRWPair(BufferedReader, BufferedWriter):

    A buffered reader object and buffered writer object put together to
    form a sequential IO object that can read and write.
+
+    This is typically used with a socket or two-way pipe.
    """

    def __init__(self, reader, writer, buffer_size=DEFAULT_BUFFER_SIZE,
@ -528,6 +602,8 @@ class BufferedRWPair(BufferedReader, BufferedWriter):

 class BufferedRandom(BufferedReader, BufferedWriter):

+    # XXX docstring
+
    def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE,
                 max_buffer_size=DEFAULT_MAX_BUFFER_SIZE):
        assert raw.seekable()
@ -561,6 +637,8 @@ class BufferedRandom(BufferedReader, BufferedWriter):
        return BufferedReader.read(self, n)

    def write(self, b):
+        if self._read_buf:
+            self.raw.seek(-len(self._read_buf), 1) # Undo readahead
            self._read_buf = b""
        return BufferedWriter.write(self, b)

@ -569,3 +647,156 @@ class BufferedRandom(BufferedReader, BufferedWriter):

    def close(self):
        self.raw.close()
+
+
+class TextIOBase(BufferedIOBase):
+
+    """Base class for text I/O.
+
+    This class provides a character and line based interface to stream I/O.
+    """
+
+    def read(self, n: int = -1) -> str:
+        """read(n: int = -1) -> str.  Read at most n characters from stream.
+
+        Read from underlying buffer until we have n characters or we hit EOF.
+        If n is negative or omitted, read until EOF.
+        """
+        raise IOError(".read() not supported")
+
+    def write(self, s: str):
+        """write(s: str) -> None.  Write string s to stream.
+        """
+        raise IOError(".write() not supported")
+
+    def readline(self) -> str:
+        """readline() -> str.  Read until newline or EOF.
+
+        Returns an empty string if EOF is hit immediately.
+        """
+        raise IOError(".readline() not supported")
+
+    def __iter__(self):
+        """__iter__() -> Iterator.  Return line iterator (actually just self).
+        """
+        return self
+
+    def next(self):
+        """Same as readline() except raises StopIteration on immediate EOF.
+        """
+        line = self.readline()
+        if line == '':
+            raise StopIteration
+        return line
+
+
+class TextIOWrapper(TextIOBase):
+
+    """Buffered text stream.
+
+    Character and line based layer over a BufferedIOBase object.
+    """
+
+    # XXX tell(), seek()
+
+    def __init__(self, buffer, encoding=None, newline=None):
+        if newline not in (None, '\n', '\r\n'):
+            raise IOError("illegal newline %s" % newline) # XXX: ValueError?
+        if encoding is None:
+            # XXX This is questionable
+            encoding = sys.getfilesystemencoding()
+            if encoding is None:
+                encoding = "latin-1"  # XXX, but this is best for transparency
+
+        self.buffer = buffer
+        self._encoding = encoding
+        self._newline = newline or os.linesep
+        self._fix_newlines = newline is None
+        self._decoder = None
+        self._pending = ''
+
+    def write(self, s: str):
+        return self.buffer.write(s.encode(self._encoding))
+
+    def _get_decoder(self):
+        make_decoder = codecs.getincrementaldecoder(self._encoding)
+        if make_decoder is None:
+            raise IOError(".readline() not supported for encoding %s" %
+                          self._encoding)
+        decoder = self._decoder = make_decoder()  # XXX: errors
+        if isinstance(decoder, codecs.BufferedIncrementalDecoder):
+            # XXX Hack: make the codec use bytes instead of strings
+            decoder.buffer = b""
+        return decoder
+
+    def read(self, n: int = -1):
+        decoder = self._decoder or self._get_decoder()
+        res = self._pending
+        if n < 0:
+            res += decoder.decode(self.buffer.read(), True)
+            self._pending = ''
+            return res
+        else:
+            while len(res) < n:
+                data = self.buffer.read(64)
+                res += decoder.decode(data, not data)
+                if not data:
+                    break
+            self._pending = res[n:]
+            return res[:n]
+
+    def readline(self):
+        line = self._pending
+        start = 0
+        decoder = self._decoder or self._get_decoder()
+
+        while True:
+            # In C we'd look for these in parallel of course.
+            nlpos = line.find("\n", start)
+            crpos = line.find("\r", start)
+            if nlpos >= 0 and crpos >= 0:
+                endpos = min(nlpos, crpos)
+            else:
+                endpos = nlpos if nlpos >= 0 else crpos
+
+            if endpos != -1:
+                endc = line[endpos]
+                if endc == "\n":
+                    ending = "\n"
+                    break
+
+                # We've seen \r - is it standalone, \r\n or \r at end of line?
+                if endpos + 1 < len(line):
+                    if line[endpos+1] == '\n':
+                        ending = "\r\n"
+                    else:
+                        ending = "\r"
+                    break
+                # There might be a following \n in the next block of data ...
+                start = endpos
+            else:
+                start = len(line)
+
+            # No line ending seen yet - get more data
+            while True:
+                data = self.buffer.read(64)
+                more_line = decoder.decode(data, not data)
+                if more_line != "" or not data:
+                    break
+
+            if more_line == "":
+                ending = ''
+                endpos = len(line)
+                break
+
+            line += more_line
+
+        nextpos = endpos + len(ending)
+        self._pending = line[nextpos:]
+
+        # XXX Update self.newlines here if we want to support that
+
+        if self._fix_newlines and ending != "\n" and ending != '':
+            return line[:endpos] + "\n"
+        else:
+            return line[:nextpos]
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@ -2,7 +2,7 @@

 import unittest
 from test import test_support
-
+from itertools import chain
 import io


@ -16,7 +16,7 @@ class MockIO(io.RawIOBase):
        try:
            return self._readStack.pop(0)
        except:
-            return io.EOF
+            return b""

    def write(self, b):
        self._writeStack.append(b)
@ -41,6 +41,18 @@ class MockIO(io.RawIOBase):
        return 42


+class MockFileIO(io.BytesIO):
+
+    def __init__(self, data):
+        self.read_history = []
+        io.BytesIO.__init__(self, data)
+
+    def read(self, n=None):
+        res = io.BytesIO.read(self, n)
+        self.read_history.append(None if res is None else len(res))
+        return res
+
+
 class MockNonBlockWriterIO(io.RawIOBase):

    def __init__(self, blockingScript):
@ -147,31 +159,31 @@ class IOTest(unittest.TestCase):
        f.close()


-class BytesIOTest(unittest.TestCase):
+class MemorySeekTest(unittest.TestCase):

    def testInit(self):
-        buf = b"1234567890"
-        bytesIo = io.BytesIO(buf)
+        buf = self.buftype("1234567890")
+        bytesIo = self.ioclass(buf)

    def testRead(self):
-        buf = b"1234567890"
-        bytesIo = io.BytesIO(buf)
+        buf = self.buftype("1234567890")
+        bytesIo = self.ioclass(buf)

        self.assertEquals(buf[:1], bytesIo.read(1))
        self.assertEquals(buf[1:5], bytesIo.read(4))
        self.assertEquals(buf[5:], bytesIo.read(900))
-        self.assertEquals(io.EOF, bytesIo.read())
+        self.assertEquals(self.EOF, bytesIo.read())

    def testReadNoArgs(self):
-        buf = b"1234567890"
-        bytesIo = io.BytesIO(buf)
+        buf = self.buftype("1234567890")
+        bytesIo = self.ioclass(buf)

        self.assertEquals(buf, bytesIo.read())
-        self.assertEquals(io.EOF, bytesIo.read())
+        self.assertEquals(self.EOF, bytesIo.read())

    def testSeek(self):
-        buf = b"1234567890"
-        bytesIo = io.BytesIO(buf)
+        buf = self.buftype("1234567890")
+        bytesIo = self.ioclass(buf)

        bytesIo.read(5)
        bytesIo.seek(0)
@ -181,8 +193,8 @@ class BytesIOTest(unittest.TestCase):
        self.assertEquals(buf[3:], bytesIo.read())

    def testTell(self):
-        buf = b"1234567890"
-        bytesIo = io.BytesIO(buf)
+        buf = self.buftype("1234567890")
+        bytesIo = self.ioclass(buf)

        self.assertEquals(0, bytesIo.tell())
        bytesIo.seek(5)
@ -191,6 +203,18 @@ class BytesIOTest(unittest.TestCase):
        self.assertEquals(10000, bytesIo.tell())


+class BytesIOTest(MemorySeekTest):
+    buftype = bytes
+    ioclass = io.BytesIO
+    EOF = b""
+
+
+class StringIOTest(MemorySeekTest):
+    buftype = str
+    ioclass = io.StringIO
+    EOF = ""
+
+
 class BufferedReaderTest(unittest.TestCase):

    def testRead(self):
@ -199,6 +223,25 @@ class BufferedReaderTest(unittest.TestCase):

        self.assertEquals(b"abcdef", bufIo.read(6))

+    def testBuffering(self):
+        data = b"abcdefghi"
+        dlen = len(data)
+
+        tests = [
+            [ 100, [ 3, 1, 4, 8 ], [ dlen, 0 ] ],
+            [ 100, [ 3, 3, 3],     [ dlen ]    ],
+            [   4, [ 1, 2, 4, 2 ], [ 4, 4, 1 ] ],
+        ]
+
+        for bufsize, buf_read_sizes, raw_read_sizes in tests:
+            rawIo = MockFileIO(data)
+            bufIo = io.BufferedReader(rawIo, buffer_size=bufsize)
+            pos = 0
+            for nbytes in buf_read_sizes:
+                self.assertEquals(bufIo.read(nbytes), data[pos:pos+nbytes])
+                pos += nbytes
+            self.assertEquals(rawIo.read_history, raw_read_sizes)
+
    def testReadNonBlocking(self):
        # Inject some None's in there to simulate EWOULDBLOCK
        rawIo = MockIO((b"abc", b"d", None, b"efg", None, None))
@ -208,7 +251,7 @@ class BufferedReaderTest(unittest.TestCase):
        self.assertEquals(b"e", bufIo.read(1))
        self.assertEquals(b"fg", bufIo.read())
        self.assert_(None is bufIo.read())
-        self.assertEquals(io.EOF, bufIo.read())
+        self.assertEquals(b"", bufIo.read())

    def testReadToEof(self):
        rawIo = MockIO((b"abc", b"d", b"efg"))
@ -270,8 +313,9 @@ class BufferedWriterTest(unittest.TestCase):

        bufIo.write(b"asdfasdfasdf")

-        # XXX I don't like this test. It relies too heavily on how the algorithm
-        # actually works, which we might change. Refactor later.
+        # XXX I don't like this test. It relies too heavily on how the
+        # algorithm actually works, which we might change. Refactor
+        # later.

    def testFileno(self):
        rawIo = MockIO((b"abc", b"d", b"efg"))
@ -299,7 +343,7 @@ class BufferedRWPairTest(unittest.TestCase):
        # XXX need implementation


-class BufferedRandom(unittest.TestCase):
+class BufferedRandomTest(unittest.TestCase):

    def testReadAndWrite(self):
        raw = MockIO((b"asdf", b"ghjk"))
@ -331,12 +375,56 @@ class BufferedRandom(unittest.TestCase):
        self.assertEquals(7, rw.tell())
        self.assertEquals(b"fl", rw.read(11))

+
+class TextIOWrapperTest(unittest.TestCase):
+    def testNewlines(self):
+        input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ]
+
+        tests = [
+            [ None, [ 'unix\n', 'windows\n', 'os9\n', 'last\n', 'nonl' ] ],
+            [ '\n', input_lines ],
+            [ '\r\n', input_lines ],
+        ]
+
+        encodings = ('utf-8', 'bz2')
+
+        # Try a range of pad sizes to test the case where \r is the last
+        # character in TextIOWrapper._pending.
+        for encoding in encodings:
+            for do_reads in (False, True):
+                for padlen in chain(range(10), range(50, 60)):
+                    pad = '.' * padlen
+                    data_lines = [ pad + line for line in input_lines ]
+                    # XXX: str.encode() should return bytes
+                    data = bytes(''.join(data_lines).encode(encoding))
+
+                    for newline, exp_line_ends in tests:
+                        exp_lines = [ pad + line for line in exp_line_ends ]
+                        bufIo = io.BufferedReader(io.BytesIO(data))
+                        textIo = io.TextIOWrapper(bufIo, newline=newline,
+                                                  encoding=encoding)
+                        if do_reads:
+                            got_lines = []
+                            while True:
+                                c2 = textIo.read(2)
+                                if c2 == '':
+                                    break
+                                self.assertEquals(len(c2), 2)
+                                got_lines.append(c2 + textIo.readline())
+                        else:
+                            got_lines = list(textIo)
+
+                        for got_line, exp_line in zip(got_lines, exp_lines):
+                            self.assertEquals(got_line, exp_line)
+                        self.assertEquals(len(got_lines), len(exp_lines))
+
 # XXX Tests for open()

 def test_main():
-    test_support.run_unittest(IOTest, BytesIOTest, BufferedReaderTest,
+    test_support.run_unittest(IOTest, BytesIOTest, StringIOTest,
+                              BufferedReaderTest,
                              BufferedWriterTest, BufferedRWPairTest,
-                              BufferedRandom)
+                              BufferedRandomTest, TextIOWrapperTest)

 if __name__ == "__main__":
    test_main()