Issue1395: Universal mode used to duplicate newlines when using read(1).

"Universal newline" is now an incremental decoder wrapping the initial one,
with its own additional buffer (if '\r' is seen at the end of the input).

A decoder allows the tell() function to record the state of the translation.
This also simplifies the readline() process.

Now test_netrc passes on Windows, as well as many new tests in test_io.py
This commit is contained in:
Amaury Forgeot d'Arc 2007-11-19 20:34:10 +00:00
parent 74c29c71b1
commit 1ff9910f59
3 changed files with 237 additions and 98 deletions

197
Lib/io.py
View File

@ -1041,6 +1041,84 @@ class TextIOBase(IOBase):
return None
class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
    """Codec used when reading a file in universal newlines mode.

    It wraps another incremental decoder, translating \\r\\n and \\r into \\n.
    It also records the types of newlines encountered.
    When used with translate=False, it ensures that the newline sequence is
    returned in one piece.
    """

    # Bit flags OR'ed into self.seennl for each newline style seen.
    _LF = 1
    _CR = 2
    _CRLF = 4

    def __init__(self, decoder, translate, errors='strict'):
        codecs.IncrementalDecoder.__init__(self, errors=errors)
        # b'\r' when the previous decode() ended with a carriage return
        # that may be the first half of a \r\n pair, else b''.
        self.buffer = b''
        self.translate = translate
        self.decoder = decoder
        self.seennl = 0

    def decode(self, input, final=False):
        """Decode *input* and translate/record newlines as configured.

        A trailing '\\r' is withheld (unless *final* is true) so that a
        \\r\\n pair split across two calls is always returned in one piece.
        """
        output = self.decoder.decode(input, final=final)
        # Reattach the '\r' withheld by the previous call.  It is prepended
        # to the *decoded* output rather than to the byte input: prepending
        # b'\r' to the input would be wrong for encodings where b'\r' does
        # not encode '\r' (e.g. UTF-16), and would also insert a stray byte
        # between the wrapped decoder's pending bytes and the new input.
        if self.buffer:
            output = "\r" + output
            self.buffer = b''
        # Retain a trailing '\r' even when not translating data:
        # then readline() is sure to get '\r\n' in one pass.
        if output.endswith("\r") and not final:
            output = output[:-1]
            self.buffer = b'\r'
        # Record which newline styles were read.  A \r\n pair must not be
        # double-counted as a lone \r plus a lone \n, hence the subtractions.
        crlf = output.count('\r\n')
        cr = output.count('\r') - crlf
        lf = output.count('\n') - crlf
        self.seennl |= (lf and self._LF) | (cr and self._CR) \
                     | (crlf and self._CRLF)
        if self.translate:
            # Replace \r\n first so the lone-\r pass cannot see its \r.
            if crlf:
                output = output.replace("\r\n", "\n")
            if cr:
                output = output.replace("\r", "\n")
        return output

    def getstate(self):
        # The pending carriage return is appended to the wrapped decoder's
        # buffered bytes so tell()/seek() can round-trip the full state.
        buf, flag = self.decoder.getstate()
        return buf + self.buffer, flag

    def setstate(self, state):
        buf, flag = state
        # Split a trailing b'\r' (stored by getstate) back into our own
        # one-character buffer before restoring the wrapped decoder.
        if buf.endswith(b'\r'):
            self.buffer = b'\r'
            buf = buf[:-1]
        else:
            self.buffer = b''
        self.decoder.setstate((buf, flag))

    def reset(self):
        # Restore the pristine state, including the newline history.
        self.buffer = b''
        self.seennl = 0
        self.decoder.reset()

    @property
    def newlines(self):
        """Newline styles seen so far: None, a str, or a tuple of strs."""
        return (None,
                "\n",
                "\r",
                ("\r", "\n"),
                "\r\n",
                ("\n", "\r\n"),
                ("\r", "\r\n"),
                ("\r", "\n", "\r\n")
               )[self.seennl]
class TextIOWrapper(TextIOBase):
"""Buffered text stream.
@ -1077,7 +1155,6 @@ class TextIOWrapper(TextIOBase):
self._readnl = newline
self._writetranslate = newline != ''
self._writenl = newline or os.linesep
self._seennl = 0
self._decoder = None
self._pending = ""
self._snapshot = None
@ -1124,6 +1201,7 @@ class TextIOWrapper(TextIOBase):
if not isinstance(s, str):
raise TypeError("can't write %s to text stream" %
s.__class__.__name__)
length = len(s)
haslf = "\n" in s
if haslf and self._writetranslate and self._writenl != "\n":
s = s.replace("\n", self._writenl)
@ -1132,15 +1210,20 @@ class TextIOWrapper(TextIOBase):
self.buffer.write(b)
if haslf and self.isatty():
self.flush()
self._snapshot = self._decoder = None
return len(s)
self._snapshot = None
if self._decoder:
self._decoder.reset()
return length
def _get_decoder(self):
    """Build, cache (in self._decoder) and return the read-side decoder."""
    make_decoder = codecs.getincrementaldecoder(self._encoding)
    if make_decoder is None:
        raise IOError("Can't find an incremental decoder for encoding %s" %
                      self._encoding)
    # NOTE(review): the next two assignment lines appear to be diff
    # residue (the old line retained next to its replacement); the
    # second assignment wins, and self._decoder is set again below
    # in either case, so the net behavior is that of the new version.
    decoder = self._decoder = make_decoder() # XXX: errors
    decoder = make_decoder() # XXX: errors
    if self._readuniversal:
        # Wrap the codec's decoder so \r\n pairs are never split across
        # reads and the newline styles seen are recorded.
        decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
    self._decoder = decoder
    return decoder
def _read_chunk(self):
@ -1220,7 +1303,8 @@ class TextIOWrapper(TextIOBase):
pos = self.buffer.seek(0, 2)
self._snapshot = None
self._pending = ""
self._decoder = None
if self._decoder:
self._decoder.reset()
return pos
if whence != 0:
raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
@ -1234,7 +1318,8 @@ class TextIOWrapper(TextIOBase):
self.buffer.seek(pos)
self._snapshot = None
self._pending = ""
self._decoder = None
if self._decoder:
self._decoder.reset()
return pos
decoder = self._decoder or self._get_decoder()
decoder.set_state(("", ds))
@ -1253,7 +1338,7 @@ class TextIOWrapper(TextIOBase):
res += decoder.decode(self.buffer.read(), True)
self._pending = ""
self._snapshot = None
return self._replacenl(res)
return res
else:
while len(res) < n:
readahead, pending = self._read_chunk()
@ -1261,7 +1346,7 @@ class TextIOWrapper(TextIOBase):
if not readahead:
break
self._pending = res[n:]
return self._replacenl(res[:n])
return res[:n]
def __next__(self):
self._telling = False
@ -1285,62 +1370,55 @@ class TextIOWrapper(TextIOBase):
line = self._pending
start = 0
cr_eof = False
decoder = self._decoder or self._get_decoder()
pos = endpos = None
ending = None
while True:
if self._readuniversal:
if self._readtranslate:
# Newlines are already translated, only search for \n
pos = line.find('\n', start)
if pos >= 0:
endpos = pos + 1
break
else:
start = len(line)
elif self._readuniversal:
# Universal newline search. Find any of \r, \r\n, \n
# The decoder ensures that \r\n are not split in two pieces
# In C we'd look for these in parallel of course.
nlpos = line.find("\n", start)
crpos = line.find("\r", start)
if crpos == -1:
if nlpos == -1:
# Nothing found
start = len(line)
else:
# Found \n
pos = nlpos
endpos = pos + 1
ending = self._LF
endpos = nlpos + 1
break
elif nlpos == -1:
if crpos == len(line) - 1:
# Found \r at end of buffer, must keep reading
start = crpos
cr_eof = True
else:
# Found lone \r
ending = self._CR
pos = crpos
endpos = pos + 1
endpos = crpos + 1
break
elif nlpos < crpos:
# Found \n
pos = nlpos
endpos = pos + 1
ending = self._LF
endpos = nlpos + 1
break
elif nlpos == crpos + 1:
# Found \r\n
ending = self._CRLF
pos = crpos
endpos = pos + 2
endpos = crpos + 2
break
else:
# Found \r
pos = crpos
endpos = pos + 1
ending = self._CR
endpos = crpos + 1
break
else:
# non-universal
pos = line.find(self._readnl)
if pos >= 0:
endpos = pos+len(self._readnl)
ending = self._nlflag(self._readnl)
endpos = pos + len(self._readnl)
break
# No line ending seen yet - get more data
@ -1356,65 +1434,14 @@ class TextIOWrapper(TextIOBase):
# end of file
self._pending = ''
self._snapshot = None
if cr_eof:
self._seennl |= self._CR
return line[:-1] + '\n'
else:
return line
self._pending = line[endpos:]
if self._readtranslate:
self._seennl |= ending
if ending != self._LF:
return line[:pos] + '\n'
else:
return line[:endpos]
else:
return line[:endpos]
def _replacenl(self, data):
    """Translate newlines in *data* per the configured read mode and
    record (in self._seennl) which newline styles have been seen."""
    # Replace newlines in data as needed and record that they have
    # been seen.
    if not self._readtranslate:
        return data
    if self._readuniversal:
        # A \r\n pair must not also be counted as a lone \r plus a
        # lone \n, hence the subtractions.
        crlf = data.count('\r\n')
        cr = data.count('\r') - crlf
        lf = data.count('\n') - crlf
        self._seennl |= (lf and self._LF) | (cr and self._CR) \
                    | (crlf and self._CRLF)
        # Replace \r\n first so the lone-\r pass cannot see its \r.
        if crlf:
            data = data.replace("\r\n", "\n")
        if cr:
            data = data.replace("\r", "\n")
    elif self._readnl == '\n':
        # Only need to detect if \n was seen.
        if data.count('\n'):
            self._seennl |= self._LF
    else:
        newdata = data.replace(self._readnl, '\n')
        if newdata is not data:
            # NOTE(review): relies on str.replace returning the same
            # object when nothing matched -- a CPython implementation
            # detail used as a cheap "was a newline seen" check.
            self._seennl |= self._nlflag(self._readnl)
        data = newdata
    return data
_LF = 1
_CR = 2
_CRLF = 4
@property
def newlines(self):
return (None,
"\n",
"\r",
("\r", "\n"),
"\r\n",
("\n", "\r\n"),
("\r", "\r\n"),
("\r", "\n", "\r\n")
)[self._seennl]
def _nlflag(self, nlstr):
return [None, "\n", "\r", None, "\r\n"].index(nlstr)
return self._decoder.newlines if self._decoder else None
class StringIO(TextIOWrapper):

View File

@ -489,6 +489,10 @@ class BufferedRandomTest(unittest.TestCase):
class TextIOWrapperTest(unittest.TestCase):
def setUp(self):
    """Raw bytes with mixed line endings plus their translated form."""
    raw = b"AAA\r\nBBB\rCCC\r\nDDD\nEEE\r\n"
    self.testdata = raw
    self.normalized = b"AAA\nBBB\nCCC\nDDD\nEEE\n".decode("ascii")
def tearDown(self):
    # Remove the temporary file a test may have left behind.
    test_support.unlink(test_support.TESTFN)
@ -496,14 +500,14 @@ class TextIOWrapperTest(unittest.TestCase):
testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
for newline, expected in [
(None, normalized.decode("ASCII").splitlines(True)),
("", testdata.decode("ASCII").splitlines(True)),
(None, normalized.decode("ascii").splitlines(True)),
("", testdata.decode("ascii").splitlines(True)),
("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
("\r", ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
]:
buf = io.BytesIO(testdata)
txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
self.assertEquals(txt.readlines(), expected)
txt.seek(0)
self.assertEquals(txt.read(), "".join(expected))
@ -518,7 +522,7 @@ class TextIOWrapperTest(unittest.TestCase):
tests = [(None, testdict[os.linesep])] + sorted(testdict.items())
for newline, expected in tests:
buf = io.BytesIO()
txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
txt.write("AAA\nB")
txt.write("BB\nCCC\n")
txt.write("X\rY\r\nZ")
@ -568,14 +572,14 @@ class TextIOWrapperTest(unittest.TestCase):
testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
for newline, expected in [
(None, normalized.decode("ASCII").splitlines(True)),
("", testdata.decode("ASCII").splitlines(True)),
(None, normalized.decode("ascii").splitlines(True)),
("", testdata.decode("ascii").splitlines(True)),
("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
("\r", ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
]:
buf = io.BytesIO(testdata)
txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
self.assertEquals(txt.readlines(), expected)
txt.seek(0)
self.assertEquals(txt.read(), "".join(expected))
@ -600,7 +604,7 @@ class TextIOWrapperTest(unittest.TestCase):
("\r\n", "\r\n", data_crlf),
]:
buf = io.BytesIO()
txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
txt.write(data)
txt.close()
self.assertEquals(buf.getvalue(), expected)
@ -745,6 +749,114 @@ class TextIOWrapperTest(unittest.TestCase):
print("Reading using readline(): %6.3f seconds" % (t3-t2))
print("Using readline()+tell(): %6.3f seconds" % (t4-t3))
def testReadOneByOne(self):
    """read(1) across a \\r\\n boundary must yield a single \\n."""
    txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB"))
    chunks = []
    # iter() with a sentinel stops on the empty string read() returns at EOF.
    for ch in iter(lambda: txt.read(1), ""):
        chunks.append(ch)
    self.assertEquals("".join(chunks), "AA\nBB")
# read in amounts equal to TextIOWrapper._CHUNK_SIZE which is 128.
def testReadByChunk(self):
    """A \\r\\n pair straddling the 128-char chunk boundary stays one \\n."""
    # make sure "\r\n" straddles 128 char boundary.
    txt = io.TextIOWrapper(io.BytesIO(b"A" * 127 + b"\r\nB"))
    pieces = []
    for piece in iter(lambda: txt.read(128), ""):
        pieces.append(piece)
    self.assertEquals("".join(pieces), "A" * 127 + "\nB")
def test_issue1395_1(self):
    """read(1) over mixed newlines yields the fully translated text."""
    txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
    result = []
    while True:
        ch = txt.read(1)
        if not ch:
            break
        result.append(ch)
    self.assertEquals("".join(result), self.normalized)
def test_issue1395_2(self):
    """read(4) with a tiny internal chunk size still translates newlines."""
    txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
    txt._CHUNK_SIZE = 4
    parts = []
    chunk = txt.read(4)
    while chunk:
        parts.append(chunk)
        chunk = txt.read(4)
    self.assertEquals("".join(parts), self.normalized)
def test_issue1395_3(self):
    """Mixing read() and readline() covers the whole translated text."""
    txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
    txt._CHUNK_SIZE = 4
    pieces = [txt.read(4), txt.read(4),
              txt.readline(), txt.readline(), txt.readline()]
    self.assertEquals("".join(pieces), self.normalized)
def test_issue1395_4(self):
    """A partial read followed by read() returns the rest, translated."""
    txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
    txt._CHUNK_SIZE = 4
    head = txt.read(4)
    self.assertEquals(head + txt.read(), self.normalized)
def test_issue1395_5(self):
    """tell()/seek() round-trip must restore the decoder's state."""
    txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
    txt._CHUNK_SIZE = 4
    # Advance past "AAA\n"; the original bound this to an unused local.
    txt.read(4)
    pos = txt.tell()
    txt.seek(0)
    txt.seek(pos)
    self.assertEquals(txt.read(4), "BBB\n")
def test_newline_decoder(self):
    """Exercise IncrementalNewlineDecoder state handling directly.

    NOTE: the asserts are strictly order-dependent -- each call relies
    on the decoder state left behind by the previous ones.
    """
    import codecs
    decoder = codecs.getincrementaldecoder("utf-8")()
    decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
    # Multi-byte characters may arrive split across several feeds.
    self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
    self.assertEquals(decoder.decode(b'\xe8'), "")
    self.assertEquals(decoder.decode(b'\xa2'), "")
    self.assertEquals(decoder.decode(b'\x88'), "\u8888")
    # An incomplete sequence still pending at the end is an error.
    self.assertEquals(decoder.decode(b'\xe8'), "")
    self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
    decoder.setstate((b'', 0))
    self.assertEquals(decoder.decode(b'\n'), "\n")
    # A trailing \r is withheld until we know whether \n follows...
    self.assertEquals(decoder.decode(b'\r'), "")
    # ...and flushed as \n when the stream is declared final.
    self.assertEquals(decoder.decode(b'', final=True), "\n")
    self.assertEquals(decoder.decode(b'\r', final=True), "\n")
    self.assertEquals(decoder.decode(b'\r'), "")
    self.assertEquals(decoder.decode(b'a'), "\na")
    self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n")
    self.assertEquals(decoder.decode(b'\r'), "")
    self.assertEquals(decoder.decode(b'\r'), "\n")
    self.assertEquals(decoder.decode(b'\na'), "\na")
    # Newline handling interleaved with multi-byte characters.
    self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n")
    self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
    self.assertEquals(decoder.decode(b'\n'), "\n")
    self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888")
    self.assertEquals(decoder.decode(b'\n'), "\n")
# XXX Tests for open()

View File

@ -867,7 +867,7 @@ static PyGetSetDef fileio_getsetlist[] = {
PyTypeObject PyFileIO_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"FileIO",
"_FileIO",
sizeof(PyFileIOObject),
0,
(destructor)fileio_dealloc, /* tp_dealloc */