Merged revisions 77798 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r77798 | antoine.pitrou | 2010-01-27 21:59:50 +0100 (mer., 27 janv. 2010) | 8 lines

  Issue #7610: Reworked implementation of the internal
  :class:`zipfile.ZipExtFile` class used to represent files stored inside
  an archive.  The new implementation is significantly faster and can
  be wrapped in a :class:`io.BufferedReader` object for more speedups.
  It also solves an issue where interleaved calls to `read()` and
  `readline()` give wrong results.  Patch by Nir Aides.
........
This commit is contained in:
Antoine Pitrou 2010-01-27 21:18:57 +00:00
parent 176d6c40e0
commit a32f9a24df
3 changed files with 226 additions and 176 deletions

View File

@@ -168,6 +168,45 @@ class TestsWithSourceFile(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_random_open_test(f, zipfile.ZIP_STORED) self.zip_random_open_test(f, zipfile.ZIP_STORED)
def test_univeral_readaheads(self):
f = io.BytesIO()
data = b'a\r\n' * 16 * 1024
zipfp = zipfile.ZipFile(f, 'w', zipfile.ZIP_STORED)
zipfp.writestr(TESTFN, data)
zipfp.close()
data2 = b''
zipfp = zipfile.ZipFile(f, 'r')
zipopen = zipfp.open(TESTFN, 'rU')
for line in zipopen:
data2 += line
zipfp.close()
self.assertEqual(data, data2.replace(b'\n', b'\r\n'))
def zip_readline_read_test(self, f, compression):
self.make_test_archive(f, compression)
# Read the ZIP archive
zipfp = zipfile.ZipFile(f, "r")
zipopen = zipfp.open(TESTFN)
data = b''
while True:
read = zipopen.readline()
if not read:
break
data += read
read = zipopen.read(100)
if not read:
break
data += read
self.assertEqual(data, self.data)
zipfp.close()
def zip_readline_test(self, f, compression): def zip_readline_test(self, f, compression):
self.make_test_archive(f, compression) self.make_test_archive(f, compression)
@@ -195,6 +234,11 @@ class TestsWithSourceFile(unittest.TestCase):
for line, zipline in zip(self.line_gen, zipfp.open(TESTFN)): for line, zipline in zip(self.line_gen, zipfp.open(TESTFN)):
self.assertEqual(zipline, line + '\n') self.assertEqual(zipline, line + '\n')
def test_readline_read_stored(self):
# Issue #7610: calls to readline() interleaved with calls to read().
for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_readline_read_test(f, zipfile.ZIP_STORED)
def test_readline_stored(self): def test_readline_stored(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_readline_test(f, zipfile.ZIP_STORED) self.zip_readline_test(f, zipfile.ZIP_STORED)
@@ -223,6 +267,12 @@ class TestsWithSourceFile(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_random_open_test(f, zipfile.ZIP_DEFLATED) self.zip_random_open_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib")
def test_readline_read_deflated(self):
# Issue #7610: calls to readline() interleaved with calls to read().
for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_readline_read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib") @skipUnless(zlib, "requires zlib")
def test_readline_deflated(self): def test_readline_deflated(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
@@ -1067,6 +1117,29 @@ class UniversalNewlineTests(unittest.TestCase):
zipdata = zipfp.open(fn, "rU").read() zipdata = zipfp.open(fn, "rU").read()
self.assertEqual(self.arcdata[sep], zipdata) self.assertEqual(self.arcdata[sep], zipdata)
def readline_read_test(self, f, compression):
self.make_test_archive(f, compression)
# Read the ZIP archive
zipfp = zipfile.ZipFile(f, "r")
for sep, fn in self.arcfiles.items():
zipopen = zipfp.open(fn, "rU")
data = b''
while True:
read = zipopen.readline()
if not read:
break
data += read
read = zipopen.read(5)
if not read:
break
data += read
self.assertEqual(data, self.arcdata['\n'])
zipfp.close()
def readline_test(self, f, compression): def readline_test(self, f, compression):
self.make_test_archive(f, compression) self.make_test_archive(f, compression)
@@ -1101,6 +1174,11 @@ class UniversalNewlineTests(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.read_test(f, zipfile.ZIP_STORED) self.read_test(f, zipfile.ZIP_STORED)
def test_readline_read_stored(self):
# Issue #7610: calls to readline() interleaved with calls to read().
for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.readline_read_test(f, zipfile.ZIP_STORED)
def test_readline_stored(self): def test_readline_stored(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.readline_test(f, zipfile.ZIP_STORED) self.readline_test(f, zipfile.ZIP_STORED)
@@ -1118,6 +1196,12 @@ class UniversalNewlineTests(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.read_test(f, zipfile.ZIP_DEFLATED) self.read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib")
def test_readline_read_deflated(self):
# Issue #7610: calls to readline() interleaved with calls to read().
for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.readline_read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib") @skipUnless(zlib, "requires zlib")
def test_readline_deflated(self): def test_readline_deflated(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):

View File

@@ -5,6 +5,8 @@ XXX references to utf-8 need further investigation.
""" """
import struct, os, time, sys, shutil import struct, os, time, sys, shutil
import binascii, io, stat import binascii, io, stat
import io
import re
try: try:
import zlib # We may need its compression method import zlib # We may need its compression method
@@ -443,205 +445,172 @@ class _ZipDecrypter:
self._UpdateKeys(c) self._UpdateKeys(c)
return c return c
class ZipExtFile: class ZipExtFile(io.BufferedIOBase):
"""File-like object for reading an archive member. """File-like object for reading an archive member.
Is returned by ZipFile.open(). Is returned by ZipFile.open().
""" """
def __init__(self, fileobj, zipinfo, decrypt=None): # Max size supported by decompressor.
self.fileobj = fileobj MAX_N = 1 << 31 - 1
self.decrypter = decrypt
self.bytes_read = 0
self.rawbuffer = b''
self.readbuffer = b''
self.linebuffer = b''
self.eof = False
self.univ_newlines = False
self.nlSeps = (b"\n", )
self.lastdiscard = b''
self.compress_type = zipinfo.compress_type # Read from compressed files in 4k blocks.
self.compress_size = zipinfo.compress_size MIN_READ_SIZE = 4096
self.closed = False # Search for universal newlines or line chunks.
self.mode = "r" PATTERN = re.compile(br'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)')
def __init__(self, fileobj, mode, zipinfo, decrypter=None):
self._fileobj = fileobj
self._decrypter = decrypter
self._decompressor = zlib.decompressobj(-15)
self._unconsumed = b''
self._readbuffer = b''
self._offset = 0
self._universal = 'U' in mode
self.newlines = None
self._compress_type = zipinfo.compress_type
self._compress_size = zipinfo.compress_size
self._compress_left = zipinfo.compress_size
# Adjust read size for encrypted files since the first 12 bytes
# are for the encryption/password information.
if self._decrypter is not None:
self._compress_left -= 12
self.mode = mode
self.name = zipinfo.filename self.name = zipinfo.filename
# read from compressed files in 64k blocks def readline(self, limit=-1):
self.compreadsize = 64*1024 """Read and return a line from the stream.
if self.compress_type == ZIP_DEFLATED:
self.dc = zlib.decompressobj(-15)
def set_univ_newlines(self, univ_newlines): If limit is specified, at most limit bytes will be read.
self.univ_newlines = univ_newlines
# pick line separator char(s) based on universal newlines flag
self.nlSeps = (b"\n", )
if self.univ_newlines:
self.nlSeps = (b"\r\n", b"\r", b"\n")
def __iter__(self):
return self
def __next__(self):
nextline = self.readline()
if not nextline:
raise StopIteration()
return nextline
def close(self):
self.closed = True
def _checkfornewline(self):
nl, nllen = -1, -1
if self.linebuffer:
# ugly check for cases where half of an \r\n pair was
# read on the last pass, and the \r was discarded. In this
# case we just throw away the \n at the start of the buffer.
if (self.lastdiscard, self.linebuffer[:1]) == (b'\r', b'\n'):
self.linebuffer = self.linebuffer[1:]
for sep in self.nlSeps:
nl = self.linebuffer.find(sep)
if nl >= 0:
nllen = len(sep)
return nl, nllen
return nl, nllen
def readline(self, size = -1):
"""Read a line with approx. size. If size is negative,
read a whole line.
""" """
if size < 0:
size = sys.maxsize
elif size == 0:
return b''
# check for a newline already in buffer if not self._universal and limit < 0:
nl, nllen = self._checkfornewline() # Shortcut common case - newline found in buffer.
i = self._readbuffer.find(b'\n', self._offset) + 1
if i > 0:
line = self._readbuffer[self._offset: i]
self._offset = i
return line
if nl >= 0: if not self._universal:
# the next line was already in the buffer return io.BufferedIOBase.readline(self, limit)
nl = min(nl, size)
else:
# no line break in buffer - try to read more
size -= len(self.linebuffer)
while nl < 0 and size > 0:
buf = self.read(min(size, 100))
if not buf:
break
self.linebuffer += buf
size -= len(buf)
# check for a newline in buffer line = b''
nl, nllen = self._checkfornewline() while limit < 0 or len(line) < limit:
readahead = self.peek(2)
if readahead == b'':
return line
# we either ran out of bytes in the file, or #
# met the specified size limit without finding a newline, # Search for universal newlines or line chunks.
# so return current buffer #
if nl < 0: # The pattern returns either a line chunk or a newline, but not
s = self.linebuffer # both. Combined with peek(2), we are assured that the sequence
self.linebuffer = b'' # '\r\n' is always retrieved completely and never split into
return s # separate newlines - '\r', '\n' due to coincidental readaheads.
#
match = self.PATTERN.search(readahead)
newline = match.group('newline')
if newline is not None:
if self.newlines is None:
self.newlines = []
if newline not in self.newlines:
self.newlines.append(newline)
self._offset += len(newline)
return line + b'\n'
buf = self.linebuffer[:nl] chunk = match.group('chunk')
self.lastdiscard = self.linebuffer[nl:nl + nllen] if limit >= 0:
self.linebuffer = self.linebuffer[nl + nllen:] chunk = chunk[: limit - len(line)]
# line is always returned with \n as newline char (except possibly self._offset += len(chunk)
# for a final incomplete line in the file, which is handled above). line += chunk
return buf + b"\n"
def readlines(self, sizehint = -1): return line
"""Return a list with all (following) lines. The sizehint parameter
is ignored in this implementation. def peek(self, n=1):
"""Returns buffered bytes without advancing the position."""
if n > len(self._readbuffer) - self._offset:
chunk = self.read(n)
self._offset -= len(chunk)
# Return up to 512 bytes to reduce allocation overhead for tight loops.
return self._readbuffer[self._offset: self._offset + 512]
def readable(self):
return True
def read(self, n=-1):
"""Read and return up to n bytes.
If the argument is omitted, None, or negative, data is read and returned until EOF is reached..
""" """
result = []
while True:
line = self.readline()
if not line: break
result.append(line)
return result
def read(self, size = None): buf = b''
# act like file obj and return empty string if size is 0 while n < 0 or n is None or n > len(buf):
if size == 0: data = self.read1(n)
return b'' if len(data) == 0:
return buf
# determine read size buf += data
bytesToRead = self.compress_size - self.bytes_read
# adjust read size for encrypted files since the first 12 bytes return buf
# are for the encryption/password information
if self.decrypter is not None:
bytesToRead -= 12
if size is not None and size >= 0: def read1(self, n):
if self.compress_type == ZIP_STORED: """Read up to n bytes with at most one read() system call."""
lr = len(self.readbuffer)
bytesToRead = min(bytesToRead, size - lr)
elif self.compress_type == ZIP_DEFLATED:
if len(self.readbuffer) > size:
# the user has requested fewer bytes than we've already
# pulled through the decompressor; don't read any more
bytesToRead = 0
else:
# user will use up the buffer, so read some more
lr = len(self.rawbuffer)
bytesToRead = min(bytesToRead, self.compreadsize - lr)
# avoid reading past end of file contents # Simplify algorithm (branching) by transforming negative n to large n.
if bytesToRead + self.bytes_read > self.compress_size: if n < 0 or n is None:
bytesToRead = self.compress_size - self.bytes_read n = self.MAX_N
# try to read from file (if necessary) # Bytes available in read buffer.
if bytesToRead > 0: len_readbuffer = len(self._readbuffer) - self._offset
data = self.fileobj.read(bytesToRead)
self.bytes_read += len(data)
try:
self.rawbuffer += data
except:
print(repr(self.fileobj), repr(self.rawbuffer),
repr(data))
raise
# handle contents of raw buffer # Read from file.
if self.rawbuffer: if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed):
newdata = self.rawbuffer nbytes = n - len_readbuffer - len(self._unconsumed)
self.rawbuffer = b'' nbytes = max(nbytes, self.MIN_READ_SIZE)
nbytes = min(nbytes, self._compress_left)
# decrypt new data if we were given an object to handle that data = self._fileobj.read(nbytes)
if newdata and self.decrypter is not None: self._compress_left -= len(data)
newdata = bytes(map(self.decrypter, newdata))
# decompress newly read data if necessary if data and self._decrypter is not None:
if newdata and self.compress_type == ZIP_DEFLATED: data = bytes(map(self._decrypter, data))
newdata = self.dc.decompress(newdata)
self.rawbuffer = self.dc.unconsumed_tail
if self.eof and len(self.rawbuffer) == 0:
# we're out of raw bytes (both from the file and
# the local buffer); flush just to make sure the
# decompressor is done
newdata += self.dc.flush()
# prevent decompressor from being used again
self.dc = None
self.readbuffer += newdata if self._compress_type == ZIP_STORED:
self._readbuffer = self._readbuffer[self._offset:] + data
self._offset = 0
else:
# Prepare deflated bytes for decompression.
self._unconsumed += data
# Handle unconsumed data.
if len(self._unconsumed) > 0 and n > len_readbuffer:
data = self._decompressor.decompress(
self._unconsumed,
max(n - len_readbuffer, self.MIN_READ_SIZE)
)
# return what the user asked for self._unconsumed = self._decompressor.unconsumed_tail
if size is None or len(self.readbuffer) <= size: if len(self._unconsumed) == 0 and self._compress_left == 0:
data = self.readbuffer data += self._decompressor.flush()
self.readbuffer = b''
else:
data = self.readbuffer[:size]
self.readbuffer = self.readbuffer[size:]
self._readbuffer = self._readbuffer[self._offset:] + data
self._offset = 0
# Read from buffer.
data = self._readbuffer[self._offset: self._offset + n]
self._offset += len(data)
return data return data
class ZipFile: class ZipFile:
""" Class with methods to open, read, write, close, list zip files. """ Class with methods to open, read, write, close, list zip files.
@@ -925,16 +894,7 @@ class ZipFile:
if h[11] != check_byte: if h[11] != check_byte:
raise RuntimeError("Bad password for file", name) raise RuntimeError("Bad password for file", name)
# build and return a ZipExtFile return ZipExtFile(zef_file, mode, zinfo, zd)
if zd is None:
zef = ZipExtFile(zef_file, zinfo)
else:
zef = ZipExtFile(zef_file, zinfo, zd)
# set universal newlines on ZipExtFile if necessary
if "U" in mode:
zef.set_univ_newlines(True)
return zef
def extract(self, member, path=None, pwd=None): def extract(self, member, path=None, pwd=None):
"""Extract a member from the archive to the current working directory, """Extract a member from the archive to the current working directory,

View File

@@ -225,7 +225,6 @@ C-API
- Issue #1419652: Change the first argument to PyImport_AppendInittab() to - Issue #1419652: Change the first argument to PyImport_AppendInittab() to
``const char *`` as the string is stored beyond the call. ``const char *`` as the string is stored beyond the call.
- Issue #2422: When compiled with the ``--with-valgrind`` option, the - Issue #2422: When compiled with the ``--with-valgrind`` option, the
pymalloc allocator will be automatically disabled when running under pymalloc allocator will be automatically disabled when running under
Valgrind. This gives improved memory leak detection when running Valgrind. This gives improved memory leak detection when running
@@ -234,6 +233,13 @@ C-API
Library Library
------- -------
- Issue #7610: Reworked implementation of the internal
:class:`zipfile.ZipExtFile` class used to represent files stored inside
an archive. The new implementation is significantly faster and can
be wrapped in a :class:`io.BufferedReader` object for more speedups.
It also solves an issue where interleaved calls to `read()` and
`readline()` give wrong results. Patch by Nir Aides.
- Issue #6963: Added "maxtasksperchild" argument to multiprocessing.Pool, - Issue #6963: Added "maxtasksperchild" argument to multiprocessing.Pool,
allowing for a maximum number of tasks within the pool to be completed by allowing for a maximum number of tasks within the pool to be completed by
the worker before that worker is terminated, and a new one created to the worker before that worker is terminated, and a new one created to