Issue1395: Universal mode used to duplicate newlines when using read(1).

"Universal newline" is now an incremental decoder wrapping the initial one,
with its own additional buffer (if '\r' is seen at the end of the input).

A decoder allows the tell() function to record the state of the translation.
This also simplifies the readline() process.

Now test_netrc passes on Windows, as well as many new tests in test_io.py
This commit is contained in:
Amaury Forgeot d'Arc 2007-11-19 20:34:10 +00:00
parent 74c29c71b1
commit 1ff9910f59
3 changed files with 237 additions and 98 deletions

195
Lib/io.py
View File

@ -1041,6 +1041,84 @@ class TextIOBase(IOBase):
return None return None
class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
    """Codec used when reading a file in universal newlines mode.
    It wraps another incremental decoder, translating \\r\\n and \\r into \\n.
    It also records the types of newlines encountered.
    When used with translate=False, it ensures that the newline sequence is
    returned in one piece.
    """

    # Bit flags OR-ed into self.seennl; their combination indexes the
    # lookup table in the `newlines` property.
    _LF = 1
    _CR = 2
    _CRLF = 4

    def __init__(self, decoder, translate, errors='strict'):
        codecs.IncrementalDecoder.__init__(self, errors=errors)
        # Holds b'\r' when the previous chunk ended with a carriage return
        # that might be the first half of a \r\n pair.
        self.buffer = b''
        self.translate = translate
        self.decoder = decoder
        self.seennl = 0

    def decode(self, input, final=False):
        # Prepend the \r retained from the previous call (if any), so a
        # \r\n pair split across two chunks is decoded in one piece.
        if self.buffer:
            input = self.buffer + input
        decoded = self.decoder.decode(input, final=final)

        # Hold back a trailing \r even when not translating: readline()
        # is then guaranteed to see a full \r\n in a single pass.
        if not final and decoded.endswith("\r"):
            decoded = decoded[:-1]
            self.buffer = b'\r'
        else:
            self.buffer = b''

        # Record which newline conventions appear in this chunk
        # (done before translation, while they are still distinguishable).
        crlf_count = decoded.count('\r\n')
        cr_count = decoded.count('\r') - crlf_count
        lf_count = decoded.count('\n') - crlf_count
        flags = 0
        if lf_count:
            flags |= self._LF
        if cr_count:
            flags |= self._CR
        if crlf_count:
            flags |= self._CRLF
        self.seennl |= flags

        if self.translate:
            if crlf_count:
                decoded = decoded.replace("\r\n", "\n")
            if cr_count:
                decoded = decoded.replace("\r", "\n")
        return decoded

    def getstate(self):
        # Append our pending \r to the wrapped decoder's undecoded bytes;
        # setstate() strips it back off, so the pair round-trips.
        buf, flag = self.decoder.getstate()
        return buf + self.buffer, flag

    def setstate(self, state):
        buf, flag = state
        if buf.endswith(b'\r'):
            self.buffer = b'\r'
            buf = buf[:-1]
        else:
            self.buffer = b''
        self.decoder.setstate((buf, flag))

    def reset(self):
        self.buffer = b''
        self.decoder.reset()

    @property
    def newlines(self):
        # Map the seennl bitmask to the documented `newlines` value:
        # None, a single string, or a tuple of the conventions seen.
        return (None,
                "\n",
                "\r",
                ("\r", "\n"),
                "\r\n",
                ("\n", "\r\n"),
                ("\r", "\r\n"),
                ("\r", "\n", "\r\n"),
                )[self.seennl]
class TextIOWrapper(TextIOBase): class TextIOWrapper(TextIOBase):
"""Buffered text stream. """Buffered text stream.
@ -1077,7 +1155,6 @@ class TextIOWrapper(TextIOBase):
self._readnl = newline self._readnl = newline
self._writetranslate = newline != '' self._writetranslate = newline != ''
self._writenl = newline or os.linesep self._writenl = newline or os.linesep
self._seennl = 0
self._decoder = None self._decoder = None
self._pending = "" self._pending = ""
self._snapshot = None self._snapshot = None
@ -1124,6 +1201,7 @@ class TextIOWrapper(TextIOBase):
if not isinstance(s, str): if not isinstance(s, str):
raise TypeError("can't write %s to text stream" % raise TypeError("can't write %s to text stream" %
s.__class__.__name__) s.__class__.__name__)
length = len(s)
haslf = "\n" in s haslf = "\n" in s
if haslf and self._writetranslate and self._writenl != "\n": if haslf and self._writetranslate and self._writenl != "\n":
s = s.replace("\n", self._writenl) s = s.replace("\n", self._writenl)
@ -1132,15 +1210,20 @@ class TextIOWrapper(TextIOBase):
self.buffer.write(b) self.buffer.write(b)
if haslf and self.isatty(): if haslf and self.isatty():
self.flush() self.flush()
self._snapshot = self._decoder = None self._snapshot = None
return len(s) if self._decoder:
self._decoder.reset()
return length
def _get_decoder(self): def _get_decoder(self):
make_decoder = codecs.getincrementaldecoder(self._encoding) make_decoder = codecs.getincrementaldecoder(self._encoding)
if make_decoder is None: if make_decoder is None:
raise IOError("Can't find an incremental decoder for encoding %s" % raise IOError("Can't find an incremental decoder for encoding %s" %
self._encoding) self._encoding)
decoder = self._decoder = make_decoder() # XXX: errors decoder = make_decoder() # XXX: errors
if self._readuniversal:
decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
self._decoder = decoder
return decoder return decoder
def _read_chunk(self): def _read_chunk(self):
@ -1220,7 +1303,8 @@ class TextIOWrapper(TextIOBase):
pos = self.buffer.seek(0, 2) pos = self.buffer.seek(0, 2)
self._snapshot = None self._snapshot = None
self._pending = "" self._pending = ""
self._decoder = None if self._decoder:
self._decoder.reset()
return pos return pos
if whence != 0: if whence != 0:
raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" % raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
@ -1234,7 +1318,8 @@ class TextIOWrapper(TextIOBase):
self.buffer.seek(pos) self.buffer.seek(pos)
self._snapshot = None self._snapshot = None
self._pending = "" self._pending = ""
self._decoder = None if self._decoder:
self._decoder.reset()
return pos return pos
decoder = self._decoder or self._get_decoder() decoder = self._decoder or self._get_decoder()
decoder.set_state(("", ds)) decoder.set_state(("", ds))
@ -1253,7 +1338,7 @@ class TextIOWrapper(TextIOBase):
res += decoder.decode(self.buffer.read(), True) res += decoder.decode(self.buffer.read(), True)
self._pending = "" self._pending = ""
self._snapshot = None self._snapshot = None
return self._replacenl(res) return res
else: else:
while len(res) < n: while len(res) < n:
readahead, pending = self._read_chunk() readahead, pending = self._read_chunk()
@ -1261,7 +1346,7 @@ class TextIOWrapper(TextIOBase):
if not readahead: if not readahead:
break break
self._pending = res[n:] self._pending = res[n:]
return self._replacenl(res[:n]) return res[:n]
def __next__(self): def __next__(self):
self._telling = False self._telling = False
@ -1285,62 +1370,55 @@ class TextIOWrapper(TextIOBase):
line = self._pending line = self._pending
start = 0 start = 0
cr_eof = False
decoder = self._decoder or self._get_decoder() decoder = self._decoder or self._get_decoder()
pos = endpos = None pos = endpos = None
ending = None
while True: while True:
if self._readuniversal: if self._readtranslate:
# Newlines are already translated, only search for \n
pos = line.find('\n', start)
if pos >= 0:
endpos = pos + 1
break
else:
start = len(line)
elif self._readuniversal:
# Universal newline search. Find any of \r, \r\n, \n # Universal newline search. Find any of \r, \r\n, \n
# The decoder ensures that \r\n are not split in two pieces
# In C we'd look for these in parallel of course. # In C we'd look for these in parallel of course.
nlpos = line.find("\n", start) nlpos = line.find("\n", start)
crpos = line.find("\r", start) crpos = line.find("\r", start)
if crpos == -1: if crpos == -1:
if nlpos == -1: if nlpos == -1:
# Nothing found
start = len(line) start = len(line)
else: else:
# Found \n # Found \n
pos = nlpos endpos = nlpos + 1
endpos = pos + 1
ending = self._LF
break break
elif nlpos == -1: elif nlpos == -1:
if crpos == len(line) - 1:
# Found \r at end of buffer, must keep reading
start = crpos
cr_eof = True
else:
# Found lone \r # Found lone \r
ending = self._CR endpos = crpos + 1
pos = crpos
endpos = pos + 1
break break
elif nlpos < crpos: elif nlpos < crpos:
# Found \n # Found \n
pos = nlpos endpos = nlpos + 1
endpos = pos + 1
ending = self._LF
break break
elif nlpos == crpos + 1: elif nlpos == crpos + 1:
# Found \r\n # Found \r\n
ending = self._CRLF endpos = crpos + 2
pos = crpos
endpos = pos + 2
break break
else: else:
# Found \r # Found \r
pos = crpos endpos = crpos + 1
endpos = pos + 1
ending = self._CR
break break
else: else:
# non-universal # non-universal
pos = line.find(self._readnl) pos = line.find(self._readnl)
if pos >= 0: if pos >= 0:
endpos = pos + len(self._readnl) endpos = pos + len(self._readnl)
ending = self._nlflag(self._readnl)
break break
# No line ending seen yet - get more data # No line ending seen yet - get more data
@ -1356,65 +1434,14 @@ class TextIOWrapper(TextIOBase):
# end of file # end of file
self._pending = '' self._pending = ''
self._snapshot = None self._snapshot = None
if cr_eof:
self._seennl |= self._CR
return line[:-1] + '\n'
else:
return line return line
self._pending = line[endpos:] self._pending = line[endpos:]
if self._readtranslate:
self._seennl |= ending
if ending != self._LF:
return line[:pos] + '\n'
else:
return line[:endpos]
else:
return line[:endpos] return line[:endpos]
def _replacenl(self, data):
# Replace newlines in data as needed and record that they have
# been seen.
if not self._readtranslate:
return data
if self._readuniversal:
crlf = data.count('\r\n')
cr = data.count('\r') - crlf
lf = data.count('\n') - crlf
self._seennl |= (lf and self._LF) | (cr and self._CR) \
| (crlf and self._CRLF)
if crlf:
data = data.replace("\r\n", "\n")
if cr:
data = data.replace("\r", "\n")
elif self._readnl == '\n':
# Only need to detect if \n was seen.
if data.count('\n'):
self._seennl |= self._LF
else:
newdata = data.replace(self._readnl, '\n')
if newdata is not data:
self._seennl |= self._nlflag(self._readnl)
data = newdata
return data
_LF = 1
_CR = 2
_CRLF = 4
@property @property
def newlines(self): def newlines(self):
return (None, return self._decoder.newlines if self._decoder else None
"\n",
"\r",
("\r", "\n"),
"\r\n",
("\n", "\r\n"),
("\r", "\r\n"),
("\r", "\n", "\r\n")
)[self._seennl]
def _nlflag(self, nlstr):
return [None, "\n", "\r", None, "\r\n"].index(nlstr)
class StringIO(TextIOWrapper): class StringIO(TextIOWrapper):

View File

@ -489,6 +489,10 @@ class BufferedRandomTest(unittest.TestCase):
class TextIOWrapperTest(unittest.TestCase): class TextIOWrapperTest(unittest.TestCase):
def setUp(self):
    # Raw bytes mixing all three newline conventions (\r\n, lone \r, \n).
    self.testdata = b"AAA\r\nBBB\rCCC\r\nDDD\nEEE\r\n"
    # Expected text after universal-newline translation of testdata.
    self.normalized = b"AAA\nBBB\nCCC\nDDD\nEEE\n".decode("ascii")
def tearDown(self): def tearDown(self):
test_support.unlink(test_support.TESTFN) test_support.unlink(test_support.TESTFN)
@ -496,14 +500,14 @@ class TextIOWrapperTest(unittest.TestCase):
testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG" testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n") normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
for newline, expected in [ for newline, expected in [
(None, normalized.decode("ASCII").splitlines(True)), (None, normalized.decode("ascii").splitlines(True)),
("", testdata.decode("ASCII").splitlines(True)), ("", testdata.decode("ascii").splitlines(True)),
("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]), ("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]), ("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
("\r", ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]), ("\r", ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
]: ]:
buf = io.BytesIO(testdata) buf = io.BytesIO(testdata)
txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline) txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
self.assertEquals(txt.readlines(), expected) self.assertEquals(txt.readlines(), expected)
txt.seek(0) txt.seek(0)
self.assertEquals(txt.read(), "".join(expected)) self.assertEquals(txt.read(), "".join(expected))
@ -518,7 +522,7 @@ class TextIOWrapperTest(unittest.TestCase):
tests = [(None, testdict[os.linesep])] + sorted(testdict.items()) tests = [(None, testdict[os.linesep])] + sorted(testdict.items())
for newline, expected in tests: for newline, expected in tests:
buf = io.BytesIO() buf = io.BytesIO()
txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline) txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
txt.write("AAA\nB") txt.write("AAA\nB")
txt.write("BB\nCCC\n") txt.write("BB\nCCC\n")
txt.write("X\rY\r\nZ") txt.write("X\rY\r\nZ")
@ -568,14 +572,14 @@ class TextIOWrapperTest(unittest.TestCase):
testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG" testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n") normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
for newline, expected in [ for newline, expected in [
(None, normalized.decode("ASCII").splitlines(True)), (None, normalized.decode("ascii").splitlines(True)),
("", testdata.decode("ASCII").splitlines(True)), ("", testdata.decode("ascii").splitlines(True)),
("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]), ("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]), ("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
("\r", ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]), ("\r", ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
]: ]:
buf = io.BytesIO(testdata) buf = io.BytesIO(testdata)
txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline) txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
self.assertEquals(txt.readlines(), expected) self.assertEquals(txt.readlines(), expected)
txt.seek(0) txt.seek(0)
self.assertEquals(txt.read(), "".join(expected)) self.assertEquals(txt.read(), "".join(expected))
@ -600,7 +604,7 @@ class TextIOWrapperTest(unittest.TestCase):
("\r\n", "\r\n", data_crlf), ("\r\n", "\r\n", data_crlf),
]: ]:
buf = io.BytesIO() buf = io.BytesIO()
txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline) txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
txt.write(data) txt.write(data)
txt.close() txt.close()
self.assertEquals(buf.getvalue(), expected) self.assertEquals(buf.getvalue(), expected)
@ -745,6 +749,114 @@ class TextIOWrapperTest(unittest.TestCase):
print("Reading using readline(): %6.3f seconds" % (t3-t2)) print("Reading using readline(): %6.3f seconds" % (t3-t2))
print("Using readline()+tell(): %6.3f seconds" % (t4-t3)) print("Using readline()+tell(): %6.3f seconds" % (t4-t3))
def testReadOneByOne(self):
txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB"))
reads = ""
while True:
c = txt.read(1)
if not c:
break
reads += c
self.assertEquals(reads, "AA\nBB")
# read in amounts equal to TextIOWrapper._CHUNK_SIZE which is 128.
def testReadByChunk(self):
# make sure "\r\n" straddles 128 char boundary.
txt = io.TextIOWrapper(io.BytesIO(b"A" * 127 + b"\r\nB"))
reads = ""
while True:
c = txt.read(128)
if not c:
break
reads += c
self.assertEquals(reads, "A"*127+"\nB")
def test_issue1395_1(self):
txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
# read one char at a time
reads = ""
while True:
c = txt.read(1)
if not c:
break
reads += c
self.assertEquals(reads, self.normalized)
def test_issue1395_2(self):
txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
txt._CHUNK_SIZE = 4
reads = ""
while True:
c = txt.read(4)
if not c:
break
reads += c
self.assertEquals(reads, self.normalized)
def test_issue1395_3(self):
txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
txt._CHUNK_SIZE = 4
reads = txt.read(4)
reads += txt.read(4)
reads += txt.readline()
reads += txt.readline()
reads += txt.readline()
self.assertEquals(reads, self.normalized)
def test_issue1395_4(self):
txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
txt._CHUNK_SIZE = 4
reads = txt.read(4)
reads += txt.read()
self.assertEquals(reads, self.normalized)
def test_issue1395_5(self):
txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
txt._CHUNK_SIZE = 4
reads = txt.read(4)
pos = txt.tell()
txt.seek(0)
txt.seek(pos)
self.assertEquals(txt.read(4), "BBB\n")
def test_newline_decoder(self):
    """Exercise IncrementalNewlineDecoder directly, feeding bytes piecemeal.

    The assertions are strictly order-dependent: each call advances the
    decoder's internal state (pending multi-byte sequence, retained \\r).
    """
    import codecs
    decoder = codecs.getincrementaldecoder("utf-8")()
    decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
    # A complete multi-byte UTF-8 sequence decodes at once; a split one
    # is buffered until its last byte arrives.
    self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
    self.assertEquals(decoder.decode(b'\xe8'), "")
    self.assertEquals(decoder.decode(b'\xa2'), "")
    self.assertEquals(decoder.decode(b'\x88'), "\u8888")
    self.assertEquals(decoder.decode(b'\xe8'), "")
    # A dangling partial sequence must raise when the input is final.
    self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
    decoder.setstate((b'', 0))
    self.assertEquals(decoder.decode(b'\n'), "\n")
    # A trailing \r is held back (it may be the start of \r\n)...
    self.assertEquals(decoder.decode(b'\r'), "")
    # ...but is flushed as \n when the input is final.
    self.assertEquals(decoder.decode(b'', final=True), "\n")
    self.assertEquals(decoder.decode(b'\r', final=True), "\n")
    self.assertEquals(decoder.decode(b'\r'), "")
    self.assertEquals(decoder.decode(b'a'), "\na")
    self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n")
    self.assertEquals(decoder.decode(b'\r'), "")
    self.assertEquals(decoder.decode(b'\r'), "\n")
    self.assertEquals(decoder.decode(b'\na'), "\na")
    # \r\n split across calls, interleaved with multi-byte characters.
    self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n")
    self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
    self.assertEquals(decoder.decode(b'\n'), "\n")
    self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888")
    self.assertEquals(decoder.decode(b'\n'), "\n")
# XXX Tests for open() # XXX Tests for open()

View File

@ -867,7 +867,7 @@ static PyGetSetDef fileio_getsetlist[] = {
PyTypeObject PyFileIO_Type = { PyTypeObject PyFileIO_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0) PyVarObject_HEAD_INIT(&PyType_Type, 0)
"FileIO", "_FileIO",
sizeof(PyFileIOObject), sizeof(PyFileIOObject),
0, 0,
(destructor)fileio_dealloc, /* tp_dealloc */ (destructor)fileio_dealloc, /* tp_dealloc */