Issue1395: Universal mode used to duplicate newlines when using read(1).

"Universal newline" is now an incremental decoder wrapping the initial one, with its own additional buffer (if '\r' is seen at the end of the input). A decoder allows the tell() function to record the state of the translation. This also simplifies the readline() process. Now test_netrc passes on Windows, as well as many new tests in test_io.py.
2007-11-19 20:34:10 +00:00 · 2007-11-19 20:34:10 +00:00 · 1ff9910f59
parent 74c29c71b1
commit 1ff9910f59
3 changed files with 237 additions and 98 deletions
--- a/Lib/io.py
+++ b/Lib/io.py
@ -1041,6 +1041,84 @@ class TextIOBase(IOBase):
        return None
 class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
    """Incremental decoder adding universal-newline handling to another decoder.

    Bytes are decoded by the wrapped ``decoder``; when ``translate`` is true,
    every \\r\\n and lone \\r in the decoded text is replaced by \\n.  The
    kinds of line endings met so far are accumulated in ``seennl`` and
    exposed through the ``newlines`` property.

    Whatever the value of ``translate``, a \\r that ends a non-final chunk is
    withheld until the next call, so a \\r\\n pair is never delivered in two
    separate pieces.
    """

    # Bit flags OR-ed into ``seennl``.
    _LF = 1
    _CR = 2
    _CRLF = 4

    def __init__(self, decoder, translate, errors='strict'):
        codecs.IncrementalDecoder.__init__(self, errors=errors)
        self.decoder = decoder
        self.translate = translate
        self.seennl = 0
        # Holds b'\r' while a trailing carriage return is being withheld.
        self.buffer = b''

    def decode(self, input, final=False):
        """Decode ``input`` and return the text, newline-translated if enabled."""
        # Put back the \r withheld by the previous call, if any.
        held = self.buffer
        data = held + input if held else input
        text = self.decoder.decode(data, final=final)

        # A trailing \r might be the first half of \r\n: withhold it unless
        # this is the last chunk.
        withhold = text.endswith("\r") and not final
        if withhold:
            text = text[:-1]
        self.buffer = b'\r' if withhold else b''

        # Count each kind of line ending; \r\n is counted once, not as \r + \n.
        n_crlf = text.count('\r\n')
        n_cr = text.count('\r') - n_crlf
        n_lf = text.count('\n') - n_crlf
        kinds = 0
        if n_lf:
            kinds |= self._LF
        if n_cr:
            kinds |= self._CR
        if n_crlf:
            kinds |= self._CRLF
        self.seennl |= kinds

        if not self.translate:
            return text
        # \r\n must be collapsed first so its \r is not translated separately.
        if n_crlf:
            text = text.replace("\r\n", "\n")
        if n_cr:
            text = text.replace("\r", "\n")
        return text

    def getstate(self):
        """Return the wrapped decoder's state with any withheld \\r appended."""
        pending, flag = self.decoder.getstate()
        return pending + self.buffer, flag

    def setstate(self, state):
        """Restore a state produced by getstate()."""
        pending, flag = state
        # A trailing \r belongs to this wrapper, not to the wrapped decoder.
        ends_with_cr = pending.endswith(b'\r')
        if ends_with_cr:
            pending = pending[:-1]
        self.buffer = b'\r' if ends_with_cr else b''
        self.decoder.setstate((pending, flag))

    def reset(self):
        """Drop any withheld \\r and reset the wrapped decoder."""
        self.buffer = b''
        self.decoder.reset()

    @property
    def newlines(self):
        """Line endings seen so far: None, a single string, or a tuple."""
        # Indexed by the ``seennl`` bit mask (_LF=1, _CR=2, _CRLF=4).
        by_mask = (
            None,
            "\n",
            "\r",
            ("\r", "\n"),
            "\r\n",
            ("\n", "\r\n"),
            ("\r", "\r\n"),
            ("\r", "\n", "\r\n"),
        )
        return by_mask[self.seennl]
 class TextIOWrapper(TextIOBase):
    """Buffered text stream.
@ -1077,7 +1155,6 @@ class TextIOWrapper(TextIOBase):
        self._readnl = newline
        self._writetranslate = newline != ''
        self._writenl = newline or os.linesep
        self._seennl = 0
        self._decoder = None
        self._pending = ""
        self._snapshot = None
@ -1124,6 +1201,7 @@ class TextIOWrapper(TextIOBase):
        if not isinstance(s, str):
            raise TypeError("can't write %s to text stream" %
                            s.__class__.__name__)
        length = len(s)
        haslf = "\n" in s
        if haslf and self._writetranslate and self._writenl != "\n":
            s = s.replace("\n", self._writenl)
@ -1132,15 +1210,20 @@ class TextIOWrapper(TextIOBase):
        self.buffer.write(b)
        if haslf and self.isatty():
            self.flush()
-        self._snapshot = self._decoder = None
+        self._snapshot = None
-        return len(s)
+        if self._decoder:
            self._decoder.reset()
        return length
    def _get_decoder(self):
        make_decoder = codecs.getincrementaldecoder(self._encoding)
        if make_decoder is None:
            raise IOError("Can't find an incremental decoder for encoding %s" %
                          self._encoding)
-        decoder = self._decoder = make_decoder()  # XXX: errors
+        decoder = make_decoder()  # XXX: errors
        if self._readuniversal:
            decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
        self._decoder = decoder
        return decoder
    def _read_chunk(self):
@ -1220,7 +1303,8 @@ class TextIOWrapper(TextIOBase):
            pos = self.buffer.seek(0, 2)
            self._snapshot = None
            self._pending = ""
-            self._decoder = None
+            if self._decoder:
                self._decoder.reset()
            return pos
        if whence != 0:
            raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
@ -1234,7 +1318,8 @@ class TextIOWrapper(TextIOBase):
            self.buffer.seek(pos)
            self._snapshot = None
            self._pending = ""
-            self._decoder = None
+            if self._decoder:
                self._decoder.reset()
            return pos
        decoder = self._decoder or self._get_decoder()
        decoder.set_state(("", ds))
@ -1253,7 +1338,7 @@ class TextIOWrapper(TextIOBase):
            res += decoder.decode(self.buffer.read(), True)
            self._pending = ""
            self._snapshot = None
-            return self._replacenl(res)
+            return res
        else:
            while len(res) < n:
                readahead, pending = self._read_chunk()
@ -1261,7 +1346,7 @@ class TextIOWrapper(TextIOBase):
                if not readahead:
                    break
            self._pending = res[n:]
-            return self._replacenl(res[:n])
+            return res[:n]
    def __next__(self):
        self._telling = False
@ -1285,62 +1370,55 @@ class TextIOWrapper(TextIOBase):
        line = self._pending
        start = 0
        cr_eof = False
        decoder = self._decoder or self._get_decoder()
        pos = endpos = None
        ending = None
        while True:
-            if self._readuniversal:
+            if self._readtranslate:
                # Newlines are already translated, only search for \n
                pos = line.find('\n', start)
                if pos >= 0:
                    endpos = pos + 1
                    break
                else:
                    start = len(line)
            elif self._readuniversal:
                # Universal newline search. Find any of \r, \r\n, \n
                # The decoder ensures that \r\n are not split in two pieces
                # In C we'd look for these in parallel of course.
                nlpos = line.find("\n", start)
                crpos = line.find("\r", start)
                if crpos == -1:
                    if nlpos == -1:
                        # Nothing found
                        start = len(line)
                    else:
                        # Found \n
-                        pos = nlpos
+                        endpos = nlpos + 1
                        endpos = pos + 1
                        ending = self._LF
                        break
                elif nlpos == -1:
-                    if crpos == len(line) - 1:
+                    # Found lone \r
-                        # Found \r at end of buffer, must keep reading
+                    endpos = crpos + 1
-                        start = crpos
+                    break
                        cr_eof = True
                    else:
                        # Found lone \r
                        ending = self._CR
                        pos = crpos
                        endpos = pos + 1
                        break
                elif nlpos < crpos:
                    # Found \n
-                    pos = nlpos
+                    endpos = nlpos + 1
                    endpos = pos + 1
                    ending = self._LF
                    break
                elif nlpos == crpos + 1:
                    # Found \r\n
-                    ending = self._CRLF
+                    endpos = crpos + 2
                    pos = crpos
                    endpos = pos + 2
                    break
                else:
                    # Found \r
-                    pos = crpos
+                    endpos = crpos + 1
                    endpos = pos + 1
                    ending = self._CR
                    break
            else:
                # non-universal
                pos = line.find(self._readnl)
                if pos >= 0:
-                    endpos = pos+len(self._readnl)
+                    endpos = pos + len(self._readnl)
                    ending = self._nlflag(self._readnl)
                    break
            # No line ending seen yet - get more data
@ -1356,65 +1434,14 @@ class TextIOWrapper(TextIOBase):
                # end of file
                self._pending = ''
                self._snapshot = None
-                if cr_eof:
+                return line
                    self._seennl |= self._CR
                    return line[:-1] + '\n'
                else:
                    return line
        self._pending = line[endpos:]
-        if self._readtranslate:
+        return line[:endpos]
            self._seennl |= ending
            if ending != self._LF:
                return line[:pos] + '\n'
            else:
                return line[:endpos]
        else:
            return line[:endpos]
    def _replacenl(self, data):
        # Replace newlines in data as needed and record that they have
        # been seen.
        if not self._readtranslate:
            return data
        if self._readuniversal:
            crlf = data.count('\r\n')
            cr = data.count('\r') - crlf
            lf = data.count('\n') - crlf
            self._seennl |= (lf and self._LF) | (cr and self._CR) \
                         | (crlf and self._CRLF)
            if crlf:
                data = data.replace("\r\n", "\n")
            if cr:
                data = data.replace("\r", "\n")
        elif self._readnl == '\n':
            # Only need to detect if \n was seen.
            if data.count('\n'):
                self._seennl |= self._LF
        else:
            newdata = data.replace(self._readnl, '\n')
            if newdata is not data:
                self._seennl |= self._nlflag(self._readnl)
            data = newdata
        return data
    _LF = 1
    _CR = 2
    _CRLF = 4
    @property
    def newlines(self):
-        return (None,
+        return self._decoder.newlines if self._decoder else None
                "\n",
                "\r",
                ("\r", "\n"),
                "\r\n",
                ("\n", "\r\n"),
                ("\r", "\r\n"),
                ("\r", "\n", "\r\n")
               )[self._seennl]
    def _nlflag(self, nlstr):
        return [None, "\n", "\r", None, "\r\n"].index(nlstr)
 class StringIO(TextIOWrapper):
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@ -489,6 +489,10 @@ class BufferedRandomTest(unittest.TestCase):
 class TextIOWrapperTest(unittest.TestCase):
    def setUp(self):
        self.testdata = b"AAA\r\nBBB\rCCC\r\nDDD\nEEE\r\n"
        self.normalized = b"AAA\nBBB\nCCC\nDDD\nEEE\n".decode("ascii")
    def tearDown(self):
        test_support.unlink(test_support.TESTFN)
@ -496,14 +500,14 @@ class TextIOWrapperTest(unittest.TestCase):
        testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
        normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
        for newline, expected in [
-            (None, normalized.decode("ASCII").splitlines(True)),
+            (None, normalized.decode("ascii").splitlines(True)),
-            ("", testdata.decode("ASCII").splitlines(True)),
+            ("", testdata.decode("ascii").splitlines(True)),
            ("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
            ("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
            ("\r",  ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
            ]:
            buf = io.BytesIO(testdata)
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
            self.assertEquals(txt.readlines(), expected)
            txt.seek(0)
            self.assertEquals(txt.read(), "".join(expected))
@ -518,7 +522,7 @@ class TextIOWrapperTest(unittest.TestCase):
        tests = [(None, testdict[os.linesep])] + sorted(testdict.items())
        for newline, expected in tests:
            buf = io.BytesIO()
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
            txt.write("AAA\nB")
            txt.write("BB\nCCC\n")
            txt.write("X\rY\r\nZ")
@ -568,14 +572,14 @@ class TextIOWrapperTest(unittest.TestCase):
        testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
        normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
        for newline, expected in [
-            (None, normalized.decode("ASCII").splitlines(True)),
+            (None, normalized.decode("ascii").splitlines(True)),
-            ("", testdata.decode("ASCII").splitlines(True)),
+            ("", testdata.decode("ascii").splitlines(True)),
            ("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
            ("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
            ("\r",  ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
            ]:
            buf = io.BytesIO(testdata)
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
            self.assertEquals(txt.readlines(), expected)
            txt.seek(0)
            self.assertEquals(txt.read(), "".join(expected))
@ -600,7 +604,7 @@ class TextIOWrapperTest(unittest.TestCase):
                ("\r\n", "\r\n", data_crlf),
                ]:
                buf = io.BytesIO()
-                txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+                txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
                txt.write(data)
                txt.close()
                self.assertEquals(buf.getvalue(), expected)
@ -745,6 +749,114 @@ class TextIOWrapperTest(unittest.TestCase):
                print("Reading using readline(): %6.3f seconds" % (t3-t2))
                print("Using readline()+tell():  %6.3f seconds" % (t4-t3))
    def testReadOneByOne(self):
        txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB"))
        reads = ""
        while True:
            c = txt.read(1)
            if not c:
                break
            reads += c
        self.assertEquals(reads, "AA\nBB")
    # read in amounts equal to TextIOWrapper._CHUNK_SIZE which is 128.
    def testReadByChunk(self):
        # make sure "\r\n" straddles 128 char boundary.
        txt = io.TextIOWrapper(io.BytesIO(b"A" * 127 + b"\r\nB"))
        reads = ""
        while True:
            c = txt.read(128)
            if not c:
                break
            reads += c
        self.assertEquals(reads, "A"*127+"\nB")
    def test_issue1395_1(self):
        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
        # read one char at a time
        reads = ""
        while True:
            c = txt.read(1)
            if not c:
                break
            reads += c
        self.assertEquals(reads, self.normalized)
    def test_issue1395_2(self):
        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
        txt._CHUNK_SIZE = 4
        reads = ""
        while True:
            c = txt.read(4)
            if not c:
                break
            reads += c
        self.assertEquals(reads, self.normalized)
    def test_issue1395_3(self):
        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
        txt._CHUNK_SIZE = 4
        reads = txt.read(4)
        reads += txt.read(4)
        reads += txt.readline()
        reads += txt.readline()
        reads += txt.readline()
        self.assertEquals(reads, self.normalized)
    def test_issue1395_4(self):
        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
        txt._CHUNK_SIZE = 4
        reads = txt.read(4)
        reads += txt.read()
        self.assertEquals(reads, self.normalized)
    def test_issue1395_5(self):
        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
        txt._CHUNK_SIZE = 4
        reads = txt.read(4)
        pos = txt.tell()
        txt.seek(0)
        txt.seek(pos)
        self.assertEquals(txt.read(4), "BBB\n")
    def test_newline_decoder(self):
        import codecs
        decoder = codecs.getincrementaldecoder("utf-8")()
        decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
        self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
        self.assertEquals(decoder.decode(b'\xe8'), "")
        self.assertEquals(decoder.decode(b'\xa2'), "")
        self.assertEquals(decoder.decode(b'\x88'), "\u8888")
        self.assertEquals(decoder.decode(b'\xe8'), "")
        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
        decoder.setstate((b'', 0))
        self.assertEquals(decoder.decode(b'\n'), "\n")
        self.assertEquals(decoder.decode(b'\r'), "")
        self.assertEquals(decoder.decode(b'', final=True), "\n")
        self.assertEquals(decoder.decode(b'\r', final=True), "\n")
        self.assertEquals(decoder.decode(b'\r'), "")
        self.assertEquals(decoder.decode(b'a'), "\na")
        self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n")
        self.assertEquals(decoder.decode(b'\r'), "")
        self.assertEquals(decoder.decode(b'\r'), "\n")
        self.assertEquals(decoder.decode(b'\na'), "\na")
        self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n")
        self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
        self.assertEquals(decoder.decode(b'\n'), "\n")
        self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888")
        self.assertEquals(decoder.decode(b'\n'), "\n")
 # XXX Tests for open()
--- a/Modules/_fileio.c
+++ b/Modules/_fileio.c
@ -867,7 +867,7 @@ static PyGetSetDef fileio_getsetlist[] = {
 PyTypeObject PyFileIO_Type = {
 	PyVarObject_HEAD_INIT(&PyType_Type, 0)
-	"FileIO",
+	"_FileIO",
 	sizeof(PyFileIOObject),
 	0,
 	(destructor)fileio_dealloc,		/* tp_dealloc */