From 18ee29d0b870caddc0806916ca2c823254f1a1f9 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 13 May 2016 13:52:49 +0300 Subject: [PATCH] Issue #26039: zipfile.ZipFile.open() can now be used to write data into a ZIP file, as well as for extracting data. Patch by Thomas Kluyver. --- Doc/library/zipfile.rst | 42 +++--- Doc/whatsnew/3.6.rst | 4 + Lib/test/test_zipfile.py | 83 ++++++++++- Lib/zipfile.py | 290 +++++++++++++++++++++++++-------------- Misc/NEWS | 3 + 5 files changed, 295 insertions(+), 127 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 19b354daeef..cf8d54715f5 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -207,15 +207,15 @@ ZipFile Objects .. index:: single: universal newlines; zipfile.ZipFile.open method -.. method:: ZipFile.open(name, mode='r', pwd=None) +.. method:: ZipFile.open(name, mode='r', pwd=None, force_zip64=False) - Extract a member from the archive as a file-like object (ZipExtFile). *name* - is the name of the file in the archive, or a :class:`ZipInfo` object. The + Access a member of the archive as a file-like object. *name* + is the name of the file in the archive, or a :class:`ZipInfo` object. The *mode* parameter, if included, must be one of the following: ``'r'`` (the - default), ``'U'``, or ``'rU'``. Choosing ``'U'`` or ``'rU'`` will enable - :term:`universal newlines` support in the read-only object. *pwd* is the - password used for encrypted files. Calling :meth:`.open` on a closed - ZipFile will raise a :exc:`RuntimeError`. + default), ``'U'``, ``'rU'`` or ``'w'``. Choosing ``'U'`` or ``'rU'`` will + enable :term:`universal newlines` support in the read-only object. *pwd* is + the password used to decrypt encrypted ZIP files. Calling :meth:`.open` on + a closed ZipFile will raise a :exc:`RuntimeError`. :meth:`~ZipFile.open` is also a context manager and therefore supports the :keyword:`with` statement:: @@ -224,17 +224,23 @@ ZipFile Objects with myzip.open('eggs.txt') as myfile: print(myfile.read()) - .. note:: + With *mode* ``'r'``, ``'U'`` or ``'rU'``, the file-like object + (``ZipExtFile``) is read-only and provides the following methods: + :meth:`~io.BufferedIOBase.read`, :meth:`~io.IOBase.readline`, + :meth:`~io.IOBase.readlines`, :meth:`__iter__`, + :meth:`~iterator.__next__`. These objects can operate independently of + the ZipFile. - The file-like object is read-only and provides the following methods: - :meth:`~io.BufferedIOBase.read`, :meth:`~io.IOBase.readline`, - :meth:`~io.IOBase.readlines`, :meth:`__iter__`, - :meth:`~iterator.__next__`. + With ``mode='w'``, a writable file handle is returned, which supports the + :meth:`~io.BufferedIOBase.write` method. While a writable file handle is open, + attempting to read or write other files in the ZIP file will raise a + :exc:`RuntimeError`. - .. note:: - - Objects returned by :meth:`.open` can operate independently of the - ZipFile. + When writing a file, if the file size is not known in advance but may exceed + 2 GiB, pass ``force_zip64=True`` to ensure that the header format is + capable of supporting large files. If the file size is known in advance, + construct a :class:`ZipInfo` object with :attr:`~ZipInfo.file_size` set, and + use that as the *name* parameter. .. note:: @@ -246,6 +252,10 @@ ZipFile Objects The ``'U'`` or ``'rU'`` mode. Use :class:`io.TextIOWrapper` for reading compressed text files in :term:`universal newlines` mode. + .. versionchanged:: 3.6 + :meth:`open` can now be used to write files into the archive with the + ``mode='w'`` option. + .. method:: ZipFile.extract(member, path=None, pwd=None) Extract a member from the archive to the current working directory; *member* diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index be4c01409f4..384e35a4caf 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -350,6 +350,10 @@ A new :meth:`ZipInfo.is_dir() ` method can be used to check if the :class:`~zipfile.ZipInfo` instance represents a directory. (Contributed by Thomas Kluyver in :issue:`26039`.) +The :meth:`ZipFile.open() ` method can now be used to +write data into a ZIP file, as well as for extracting data. +(Contributed by Thomas Kluyver in :issue:`26039`.) + zlib ---- diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 61f561b0fae..b33995566cf 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -61,6 +61,9 @@ class AbstractTestsWithSourceFile: zipfp.write(TESTFN, "another.name") zipfp.write(TESTFN, TESTFN) zipfp.writestr("strfile", self.data) + with zipfp.open('written-open-w', mode='w') as f: + for line in self.line_gen: + f.write(line) def zip_test(self, f, compression): self.make_test_archive(f, compression) @@ -76,7 +79,7 @@ class AbstractTestsWithSourceFile: zipfp.printdir(file=fp) directory = fp.getvalue() lines = directory.splitlines() - self.assertEqual(len(lines), 4) # Number of files + header + self.assertEqual(len(lines), 5) # Number of files + header self.assertIn('File Name', lines[0]) self.assertIn('Modified', lines[0]) @@ -90,23 +93,25 @@ class AbstractTestsWithSourceFile: # Check the namelist names = zipfp.namelist() - self.assertEqual(len(names), 3) + self.assertEqual(len(names), 4) self.assertIn(TESTFN, names) self.assertIn("another.name", names) self.assertIn("strfile", names) + self.assertIn("written-open-w", names) # Check infolist infos = zipfp.infolist() names = [i.filename for i in infos] - self.assertEqual(len(names), 3) + self.assertEqual(len(names), 4) self.assertIn(TESTFN, names) self.assertIn("another.name", names) self.assertIn("strfile", names) + self.assertIn("written-open-w", names) for i in infos: self.assertEqual(i.file_size, len(self.data)) # check getinfo - for nm in (TESTFN, "another.name", "strfile"): + for nm in (TESTFN, "another.name", "strfile", "written-open-w"): info = zipfp.getinfo(nm) self.assertEqual(info.filename, nm) self.assertEqual(info.file_size, len(self.data)) @@ -372,14 +377,18 @@ class StoredTestsWithSourceFile(AbstractTestsWithSourceFile, test_low_compression = None def zip_test_writestr_permissions(self, f, compression): - # Make sure that writestr creates files with mode 0600, - # when it is passed a name rather than a ZipInfo instance. + # Make sure that writestr and open(... mode='w') create files with + # mode 0600, when they are passed a name rather than a ZipInfo + # instance. self.make_test_archive(f, compression) with zipfile.ZipFile(f, "r") as zipfp: zinfo = zipfp.getinfo('strfile') self.assertEqual(zinfo.external_attr, 0o600 << 16) + zinfo2 = zipfp.getinfo('written-open-w') + self.assertEqual(zinfo2.external_attr, 0o600 << 16) + def test_writestr_permissions(self): for f in get_files(self): self.zip_test_writestr_permissions(f, zipfile.ZIP_STORED) @@ -451,6 +460,10 @@ class StoredTestsWithSourceFile(AbstractTestsWithSourceFile, with zipfile.ZipFile(TESTFN2, mode="r") as zipfp: self.assertRaises(RuntimeError, zipfp.write, TESTFN) + with zipfile.ZipFile(TESTFN2, mode="r") as zipfp: + with self.assertRaises(RuntimeError): + zipfp.open(TESTFN, mode='w') + def test_add_file_before_1980(self): # Set atime and mtime to 1970-01-01 os.utime(TESTFN, (0, 0)) @@ -1428,6 +1441,35 @@ class OtherTests(unittest.TestCase): # testzip returns the name of the first corrupt file, or None self.assertIsNone(zipf.testzip()) + def test_open_conflicting_handles(self): + # It's only possible to open one writable file handle at a time + msg1 = b"It's fun to charter an accountant!" + msg2 = b"And sail the wide accountant sea" + msg3 = b"To find, explore the funds offshore" + with zipfile.ZipFile(TESTFN2, 'w', zipfile.ZIP_STORED) as zipf: + with zipf.open('foo', mode='w') as w2: + w2.write(msg1) + with zipf.open('bar', mode='w') as w1: + with self.assertRaises(RuntimeError): + zipf.open('handle', mode='w') + with self.assertRaises(RuntimeError): + zipf.open('foo', mode='r') + with self.assertRaises(RuntimeError): + zipf.writestr('str', 'abcde') + with self.assertRaises(RuntimeError): + zipf.write(__file__, 'file') + with self.assertRaises(RuntimeError): + zipf.close() + w1.write(msg2) + with zipf.open('baz', mode='w') as w2: + w2.write(msg3) + + with zipfile.ZipFile(TESTFN2, 'r') as zipf: + self.assertEqual(zipf.read('foo'), msg1) + self.assertEqual(zipf.read('bar'), msg2) + self.assertEqual(zipf.read('baz'), msg3) + self.assertEqual(zipf.namelist(), ['foo', 'bar', 'baz']) + def tearDown(self): unlink(TESTFN) unlink(TESTFN2) @@ -1761,6 +1803,22 @@ class UnseekableTests(unittest.TestCase): with zipf.open('twos') as zopen: self.assertEqual(zopen.read(), b'222') + def test_open_write(self): + for wrapper in (lambda f: f), Tellable, Unseekable: + with self.subTest(wrapper=wrapper): + f = io.BytesIO() + f.write(b'abc') + bf = io.BufferedWriter(f) + with zipfile.ZipFile(wrapper(bf), 'w', zipfile.ZIP_STORED) as zipf: + with zipf.open('ones', 'w') as zopen: + zopen.write(b'111') + with zipf.open('twos', 'w') as zopen: + zopen.write(b'222') + self.assertEqual(f.getvalue()[:5], b'abcPK') + with zipfile.ZipFile(f) as zipf: + self.assertEqual(zipf.read('ones'), b'111') + self.assertEqual(zipf.read('twos'), b'222') + @requires_zlib class TestsWithMultipleOpens(unittest.TestCase): @@ -1870,6 +1928,19 @@ class TestsWithMultipleOpens(unittest.TestCase): with open(os.devnull) as f: self.assertLess(f.fileno(), 100) + def test_write_while_reading(self): + with zipfile.ZipFile(TESTFN2, 'w', zipfile.ZIP_DEFLATED) as zipf: + zipf.writestr('ones', self.data1) + with zipfile.ZipFile(TESTFN2, 'a', zipfile.ZIP_DEFLATED) as zipf: + with zipf.open('ones', 'r') as r1: + data1 = r1.read(500) + with zipf.open('twos', 'w') as w1: + w1.write(self.data2) + data1 += r1.read() + self.assertEqual(data1, self.data1) + with zipfile.ZipFile(TESTFN2) as zipf: + self.assertEqual(zipf.read('twos'), self.data2) + def tearDown(self): unlink(TESTFN2) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index e0598d27ed7..03dead53171 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -686,14 +686,19 @@ def _get_decompressor(compress_type): class _SharedFile: - def __init__(self, file, pos, close, lock): + def __init__(self, file, pos, close, lock, writing): self._file = file self._pos = pos self._close = close self._lock = lock + self._writing = writing def read(self, n=-1): with self._lock: + if self._writing(): + raise RuntimeError("Can't read from the ZIP file while there " + "is an open writing handle on it. " + "Close the writing handle before trying to read.") self._file.seek(self._pos) data = self._file.read(n) self._pos = self._file.tell() @@ -993,6 +998,76 @@ class ZipExtFile(io.BufferedIOBase): super().close() +class _ZipWriteFile(io.BufferedIOBase): + def __init__(self, zf, zinfo, zip64): + self._zinfo = zinfo + self._zip64 = zip64 + self._zipfile = zf + self._compressor = _get_compressor(zinfo.compress_type) + self._file_size = 0 + self._compress_size = 0 + self._crc = 0 + + @property + def _fileobj(self): + return self._zipfile.fp + + def writable(self): + return True + + def write(self, data): + nbytes = len(data) + self._file_size += nbytes + self._crc = crc32(data, self._crc) + if self._compressor: + data = self._compressor.compress(data) + self._compress_size += len(data) + self._fileobj.write(data) + return nbytes + + def close(self): + super().close() + # Flush any data from the compressor, and update header info + if self._compressor: + buf = self._compressor.flush() + self._compress_size += len(buf) + self._fileobj.write(buf) + self._zinfo.compress_size = self._compress_size + else: + self._zinfo.compress_size = self._file_size + self._zinfo.CRC = self._crc + self._zinfo.file_size = self._file_size + + # Write updated header info + if self._zinfo.flag_bits & 0x08: + # Write CRC and file sizes after the file data + fmt = ' ZIP64_LIMIT: + raise RuntimeError('File size unexpectedly exceeded ZIP64 ' + 'limit') + if self._compress_size > ZIP64_LIMIT: + raise RuntimeError('Compressed size unexpectedly exceeded ' + 'ZIP64 limit') + # Seek backwards and write file header (which will now include + # correct CRC and file sizes) + + # Preserve current position in file + self._zipfile.start_dir = self._fileobj.tell() + self._fileobj.seek(self._zinfo.header_offset) + self._fileobj.write(self._zinfo.FileHeader(self._zip64)) + self._fileobj.seek(self._zipfile.start_dir) + + self._zipfile._writing = False + + # Successfully written: Add file to our caches + self._zipfile.filelist.append(self._zinfo) + self._zipfile.NameToInfo[self._zinfo.filename] = self._zinfo + class ZipFile: """ Class with methods to open, read, write, close, list zip files. @@ -1055,6 +1130,7 @@ class ZipFile: self._fileRefCnt = 1 self._lock = threading.RLock() self._seekable = True + self._writing = False try: if mode == 'r': @@ -1267,30 +1343,59 @@ class ZipFile: with self.open(name, "r", pwd) as fp: return fp.read() - def open(self, name, mode="r", pwd=None): - """Return file-like object for 'name'.""" - if mode not in ("r", "U", "rU"): - raise RuntimeError('open() requires mode "r", "U", or "rU"') + def open(self, name, mode="r", pwd=None, force_zip64=False): + """Return file-like object for 'name'. + + name is a string for the file name within the ZIP file, or a ZipInfo + object. + + mode should be 'r' to read a file already in the ZIP file, or 'w' to + write to a file newly added to the archive. + + pwd is the password to decrypt files (only used for reading). + + When writing, if the file size is not known in advance but may exceed + 2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large + files. If the size is known in advance, it is best to pass a ZipInfo + instance for name, with zinfo.file_size set. + """ + if mode not in {"r", "w", "U", "rU"}: + raise RuntimeError('open() requires mode "r", "w", "U", or "rU"') if 'U' in mode: import warnings warnings.warn("'U' mode is deprecated", DeprecationWarning, 2) if pwd and not isinstance(pwd, bytes): raise TypeError("pwd: expected bytes, got %s" % type(pwd)) + if pwd and (mode == "w"): + raise ValueError("pwd is only supported for reading files") if not self.fp: raise RuntimeError( - "Attempt to read ZIP archive that was already closed") + "Attempt to use ZIP archive that was already closed") # Make sure we have an info object if isinstance(name, ZipInfo): # 'name' is already an info object zinfo = name + elif mode == 'w': + zinfo = ZipInfo(name) + zinfo.compress_type = self.compression else: # Get info object for name zinfo = self.getinfo(name) + if mode == 'w': + return self._open_to_write(zinfo, force_zip64=force_zip64) + + if self._writing: + raise RuntimeError("Can't read from the ZIP file while there " + "is an open writing handle on it. " + "Close the writing handle before trying to read.") + + # Open for reading: self._fileRefCnt += 1 - zef_file = _SharedFile(self.fp, zinfo.header_offset, self._fpclose, self._lock) + zef_file = _SharedFile(self.fp, zinfo.header_offset, + self._fpclose, self._lock, lambda: self._writing) try: # Skip the file header: fheader = zef_file.read(sizeFileHeader) @@ -1355,6 +1460,49 @@ class ZipFile: zef_file.close() raise + def _open_to_write(self, zinfo, force_zip64=False): + if force_zip64 and not self._allowZip64: + raise ValueError( + "force_zip64 is True, but allowZip64 was False when opening " + "the ZIP file." + ) + if self._writing: + raise RuntimeError("Can't write to the ZIP file while there is " + "another write handle open on it. " + "Close the first handle before opening another.") + + # Sizes and CRC are overwritten with correct data after processing the file + if not hasattr(zinfo, 'file_size'): + zinfo.file_size = 0 + zinfo.compress_size = 0 + zinfo.CRC = 0 + + zinfo.flag_bits = 0x00 + if zinfo.compress_type == ZIP_LZMA: + # Compressed data includes an end-of-stream (EOS) marker + zinfo.flag_bits |= 0x02 + if not self._seekable: + zinfo.flag_bits |= 0x08 + + if not zinfo.external_attr: + zinfo.external_attr = 0o600 << 16 # permissions: ?rw------- + + # Compressed size can be larger than uncompressed size + zip64 = self._allowZip64 and \ + (force_zip64 or zinfo.file_size * 1.05 > ZIP64_LIMIT) + + if self._seekable: + self.fp.seek(self.start_dir) + zinfo.header_offset = self.fp.tell() + + self._writecheck(zinfo) + self._didModify = True + + self.fp.write(zinfo.FileHeader(zip64)) + + self._writing = True + return _ZipWriteFile(self, zinfo, zip64) + def extract(self, member, path=None, pwd=None): """Extract a member from the archive to the current working directory, using its full name. Its file information is extracted as accurately @@ -1464,6 +1612,10 @@ class ZipFile: if not self.fp: raise RuntimeError( "Attempt to write to ZIP archive that was already closed") + if self._writing: + raise RuntimeError( + "Can't write to ZIP archive while an open writing handle exists" + ) zinfo = ZipInfo.from_file(filename, arcname) @@ -1476,75 +1628,25 @@ class ZipFile: else: zinfo.compress_type = self.compression - with self._lock: - if self._seekable: - self.fp.seek(self.start_dir) - zinfo.header_offset = self.fp.tell() # Start of header bytes - if zinfo.compress_type == ZIP_LZMA: + if zinfo.is_dir(): + with self._lock: + if self._seekable: + self.fp.seek(self.start_dir) + zinfo.header_offset = self.fp.tell() # Start of header bytes + if zinfo.compress_type == ZIP_LZMA: # Compressed data includes an end-of-stream (EOS) marker - zinfo.flag_bits |= 0x02 + zinfo.flag_bits |= 0x02 - self._writecheck(zinfo) - self._didModify = True + self._writecheck(zinfo) + self._didModify = True - if zinfo.is_dir(): self.filelist.append(zinfo) self.NameToInfo[zinfo.filename] = zinfo self.fp.write(zinfo.FileHeader(False)) self.start_dir = self.fp.tell() - return - - cmpr = _get_compressor(zinfo.compress_type) - if not self._seekable: - zinfo.flag_bits |= 0x08 - with open(filename, "rb") as fp: - # Must overwrite CRC and sizes with correct data later - zinfo.CRC = CRC = 0 - zinfo.compress_size = compress_size = 0 - # Compressed size can be larger than uncompressed size - zip64 = self._allowZip64 and \ - zinfo.file_size * 1.05 > ZIP64_LIMIT - self.fp.write(zinfo.FileHeader(zip64)) - file_size = 0 - while 1: - buf = fp.read(1024 * 8) - if not buf: - break - file_size = file_size + len(buf) - CRC = crc32(buf, CRC) - if cmpr: - buf = cmpr.compress(buf) - compress_size = compress_size + len(buf) - self.fp.write(buf) - if cmpr: - buf = cmpr.flush() - compress_size = compress_size + len(buf) - self.fp.write(buf) - zinfo.compress_size = compress_size - else: - zinfo.compress_size = file_size - zinfo.CRC = CRC - zinfo.file_size = file_size - if zinfo.flag_bits & 0x08: - # Write CRC and file sizes after the file data - fmt = ' ZIP64_LIMIT: - raise RuntimeError('File size has increased during compressing') - if compress_size > ZIP64_LIMIT: - raise RuntimeError('Compressed size larger than uncompressed size') - # Seek backwards and write file header (which will now include - # correct CRC and file sizes) - self.start_dir = self.fp.tell() # Preserve current position in file - self.fp.seek(zinfo.header_offset) - self.fp.write(zinfo.FileHeader(zip64)) - self.fp.seek(self.start_dir) - self.filelist.append(zinfo) - self.NameToInfo[zinfo.filename] = zinfo + else: + with open(filename, "rb") as src, self.open(zinfo, 'w') as dest: + shutil.copyfileobj(src, dest, 1024*8) def writestr(self, zinfo_or_arcname, data, compress_type=None): """Write a file into the archive. The contents is 'data', which @@ -1569,45 +1671,18 @@ class ZipFile: if not self.fp: raise RuntimeError( "Attempt to write to ZIP archive that was already closed") + if self._writing: + raise RuntimeError( + "Can't write to ZIP archive while an open writing handle exists." + ) + + if compress_type is not None: + zinfo.compress_type = compress_type zinfo.file_size = len(data) # Uncompressed size with self._lock: - if self._seekable: - self.fp.seek(self.start_dir) - zinfo.header_offset = self.fp.tell() # Start of header data - if compress_type is not None: - zinfo.compress_type = compress_type - zinfo.header_offset = self.fp.tell() # Start of header data - if compress_type is not None: - zinfo.compress_type = compress_type - if zinfo.compress_type == ZIP_LZMA: - # Compressed data includes an end-of-stream (EOS) marker - zinfo.flag_bits |= 0x02 - - self._writecheck(zinfo) - self._didModify = True - zinfo.CRC = crc32(data) # CRC-32 checksum - co = _get_compressor(zinfo.compress_type) - if co: - data = co.compress(data) + co.flush() - zinfo.compress_size = len(data) # Compressed size - else: - zinfo.compress_size = zinfo.file_size - zip64 = zinfo.file_size > ZIP64_LIMIT or \ - zinfo.compress_size > ZIP64_LIMIT - if zip64 and not self._allowZip64: - raise LargeZipFile("Filesize would require ZIP64 extensions") - self.fp.write(zinfo.FileHeader(zip64)) - self.fp.write(data) - if zinfo.flag_bits & 0x08: - # Write CRC and file sizes after the file data - fmt = '