From 066df4fd454d6ff9be66e80b2a65995b10af174f Mon Sep 17 00:00:00 2001 From: John Jolly Date: Tue, 30 Jan 2018 01:51:35 -0700 Subject: [PATCH] bpo-22908: Add seek and tell functionality to ZipExtFile (GH-4966) This allows for nested zip files, tar files within zip files, zip files within tar files, etc. Contributed by: John Jolly --- Doc/library/zipfile.rst | 6 +- Lib/test/test_zipfile.py | 34 ++++++++ Lib/zipfile.py | 82 +++++++++++++++++++ .../2017-12-21-22-00-11.bpo-22908.cVm89I.rst | 2 + 4 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index d58efe0b417..7c9a8c80225 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -246,9 +246,9 @@ ZipFile Objects With *mode* ``'r'`` the file-like object (``ZipExtFile``) is read-only and provides the following methods: :meth:`~io.BufferedIOBase.read`, :meth:`~io.IOBase.readline`, - :meth:`~io.IOBase.readlines`, :meth:`__iter__`, - :meth:`~iterator.__next__`. These objects can operate independently of - the ZipFile. + :meth:`~io.IOBase.readlines`, :meth:`~io.IOBase.seek`, + :meth:`~io.IOBase.tell`, :meth:`__iter__`, :meth:`~iterator.__next__`. + These objects can operate independently of the ZipFile. With ``mode='w'``, a writable file handle is returned, which supports the :meth:`~io.BufferedIOBase.write` method. While a writable file handle is open, diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 94db858a151..61c3e349a69 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -1628,6 +1628,40 @@ class OtherTests(unittest.TestCase): self.assertEqual(zipf.read('baz'), msg3) self.assertEqual(zipf.namelist(), ['foo', 'bar', 'baz']) + def test_seek_tell(self): + # Test seek functionality + txt = b"Where's Bruce?" 
+        bloc = txt.find(b"Bruce")
+        # Check seek on a file
+        with zipfile.ZipFile(TESTFN, "w") as zipf:
+            zipf.writestr("foo.txt", txt)
+        with zipfile.ZipFile(TESTFN, "r") as zipf:
+            with zipf.open("foo.txt", "r") as fp:
+                fp.seek(bloc, os.SEEK_SET)
+                self.assertEqual(fp.tell(), bloc)
+                fp.seek(-bloc, os.SEEK_CUR)
+                self.assertEqual(fp.tell(), 0)
+                fp.seek(bloc, os.SEEK_CUR)
+                self.assertEqual(fp.tell(), bloc)
+                self.assertEqual(fp.read(5), txt[bloc:bloc+5])
+                fp.seek(0, os.SEEK_END)
+                self.assertEqual(fp.tell(), len(txt))
+        # Check seek on memory file
+        data = io.BytesIO()
+        with zipfile.ZipFile(data, mode="w") as zipf:
+            zipf.writestr("foo.txt", txt)
+        with zipfile.ZipFile(data, mode="r") as zipf:
+            with zipf.open("foo.txt", "r") as fp:
+                fp.seek(bloc, os.SEEK_SET)
+                self.assertEqual(fp.tell(), bloc)
+                fp.seek(-bloc, os.SEEK_CUR)
+                self.assertEqual(fp.tell(), 0)
+                fp.seek(bloc, os.SEEK_CUR)
+                self.assertEqual(fp.tell(), bloc)
+                self.assertEqual(fp.read(5), txt[bloc:bloc+5])
+                fp.seek(0, os.SEEK_END)
+                self.assertEqual(fp.tell(), len(txt))
+
     def tearDown(self):
         unlink(TESTFN)
         unlink(TESTFN2)
diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index f9db45f58a2..5df7b1bf75b 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -696,6 +696,18 @@ class _SharedFile:
         self._close = close
         self._lock = lock
         self._writing = writing
+        self.seekable = file.seekable
+        self.tell = file.tell
+
+    def seek(self, offset, whence=0):
+        with self._lock:
+            if self._writing():  # the callable stored by __init__ is `_writing`
+                raise ValueError("Can't reposition in the ZIP file while "
+                                 "there is an open writing handle on it. "
+                                 "Close the writing handle before trying to read.")
+            self._file.seek(offset, whence)  # honour the requested target, not the cached _pos
+            self._pos = self._file.tell()  # remember where this handle now stands
+            return self._pos
 
     def read(self, n=-1):
         with self._lock:
@@ -746,6 +758,9 @@ class ZipExtFile(io.BufferedIOBase):
     # Read from compressed files in 4k blocks.
     MIN_READ_SIZE = 4096
 
+    # Chunk size to read during seek
+    MAX_SEEK_READ = 1 << 24
+
     def __init__(self, fileobj, mode, zipinfo, decrypter=None,
                  close_fileobj=False):
         self._fileobj = fileobj
@@ -778,6 +793,17 @@ class ZipExtFile(io.BufferedIOBase):
         else:
             self._expected_crc = None
 
+        self._seekable = False
+        try:
+            if fileobj.seekable():
+                self._orig_compress_start = fileobj.tell()
+                self._orig_compress_size = zipinfo.compress_size
+                self._orig_file_size = zipinfo.file_size
+                self._orig_start_crc = self._running_crc
+                self._seekable = True
+        except AttributeError:
+            pass
+
     def __repr__(self):
         result = ['<%s.%s' % (self.__class__.__module__,
                               self.__class__.__qualname__)]
@@ -963,6 +989,62 @@ class ZipExtFile(io.BufferedIOBase):
         finally:
             super().close()
 
+    def seekable(self):
+        return self._seekable
+
+    def seek(self, offset, whence=0):
+        if not self._seekable:
+            raise io.UnsupportedOperation("underlying stream is not seekable")
+        curr_pos = self.tell()
+        if whence == 0:  # Seek from start of file
+            new_pos = offset
+        elif whence == 1:  # Seek from current position
+            new_pos = curr_pos + offset
+        elif whence == 2:  # Seek from EOF
+            new_pos = self._orig_file_size + offset
+        else:
+            raise ValueError("whence must be os.SEEK_SET (0), "
+                             "os.SEEK_CUR (1), or os.SEEK_END (2)")
+
+        if new_pos > self._orig_file_size:
+            new_pos = self._orig_file_size
+
+        if new_pos < 0:
+            new_pos = 0
+
+        read_offset = new_pos - curr_pos
+        buff_offset = read_offset + self._offset
+
+        if 0 <= buff_offset < len(self._readbuffer):
+            # Just move the _offset index if the new position is in the _readbuffer
+            self._offset = buff_offset
+            read_offset = 0
+        elif read_offset < 0:
+            # Position is before the current position. Reset the ZipExtFile
+            # NOTE(review): decrypter state is not rewound here - confirm for encrypted members
+            self._fileobj.seek(self._orig_compress_start)
+            self._running_crc = self._orig_start_crc
+            self._compress_left = self._orig_compress_size
+            self._left = self._orig_file_size
+            self._readbuffer = b''
+            self._offset = 0
+            self._decompressor = _get_decompressor(self._compress_type)  # module-level helper; `zipfile` is not a name in this module
+            self._eof = False
+            read_offset = new_pos
+
+        while read_offset > 0:
+            read_len = min(self.MAX_SEEK_READ, read_offset)
+            self.read(read_len)
+            read_offset -= read_len
+
+        return self.tell()
+
+    def tell(self):
+        if not self._seekable:
+            raise io.UnsupportedOperation("underlying stream is not seekable")
+        filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset
+        return filepos
+
 
 class _ZipWriteFile(io.BufferedIOBase):
     def __init__(self, zf, zinfo, zip64):
diff --git a/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst b/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst
new file mode 100644
index 00000000000..4f3cc016601
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst
@@ -0,0 +1,2 @@
+Added seek and tell to the ZipExtFile class. This only works if the file
+object used to open the zipfile is seekable.