From b44898299a2ed97045c270f6474785da2ff07ced Mon Sep 17 00:00:00 2001 From: Tim Hatch Date: Wed, 27 Mar 2024 23:54:51 -0700 Subject: [PATCH] gh-89739: gh-77140: Support zip64 in zipimport (GH-94146) * Reads zip64 files as produced by the zipfile module * Include tests (somewhat slow, however, because of the need to create "large" zips) * About the same amount of strictness reading invalid zip files as zipfile has * Still works on files with prepended data (like pex) There are a lot more test cases at https://github.com/thatch/zipimport64/ that give me confidence that this works for real-world files. Fixes #89739 and #77140. --------- Co-authored-by: Itamar Ostricher Reviewed-by: Gregory P. Smith --- Doc/library/zipimport.rst | 3 + Doc/whatsnew/3.13.rst | 6 + Lib/importlib/_bootstrap_external.py | 5 + Lib/test/test_zipimport.py | 12 ++ Lib/zipimport.py | 166 ++++++++++++++---- ...2-06-22-14-45-32.gh-issue-89739.CqZcRL.rst | 1 + 6 files changed, 154 insertions(+), 39 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst diff --git a/Doc/library/zipimport.rst b/Doc/library/zipimport.rst index 47c81f0e636..7a8c837307e 100644 --- a/Doc/library/zipimport.rst +++ b/Doc/library/zipimport.rst @@ -30,6 +30,9 @@ Any files may be present in the ZIP archive, but importers are only invoked for corresponding :file:`.pyc` file, meaning that if a ZIP archive doesn't contain :file:`.pyc` files, importing may be rather slow. +.. versionchanged:: 3.13 + ZIP64 is supported + .. versionchanged:: 3.8 Previously, ZIP archives with an archive comment were not supported. diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index e6234bf974e..5a5c506d83d 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -700,6 +700,12 @@ xml.etree.ElementTree :func:`~xml.etree.ElementTree.iterparse` for explicit cleaning up. (Contributed by Serhiy Storchaka in :gh:`69893`.) +zipimport +--------- + +* Gains support for ZIP64 format files. Everybody loves huge code right? + (Contributed by Tim Hatch in :gh:`94146`.) + Optimizations ============= diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index 4749a627c50..0a11dc9efc2 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -81,6 +81,11 @@ def _pack_uint32(x): return (int(x) & 0xFFFFFFFF).to_bytes(4, 'little') +def _unpack_uint64(data): + """Convert 8 bytes in little-endian to an integer.""" + assert len(data) == 8 + return int.from_bytes(data, 'little') + def _unpack_uint32(data): """Convert 4 bytes in little-endian to an integer.""" assert len(data) == 4 diff --git a/Lib/test/test_zipimport.py b/Lib/test/test_zipimport.py index c12798d221e..ae497002943 100644 --- a/Lib/test/test_zipimport.py +++ b/Lib/test/test_zipimport.py @@ -128,6 +128,10 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase): f.write(stuff) f.write(data) + def getZip64Files(self): + # This is the simplest way to make zipfile generate the zip64 EOCD block + return {f"f{n}.py": (NOW, test_src) for n in range(65537)} + def doTest(self, expected_ext, files, *modules, **kw): self.makeZip(files, **kw) @@ -798,6 +802,14 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase): files = {TESTMOD + ".py": (NOW, test_src)} self.doTest(".py", files, TESTMOD, comment=b"c" * ((1 << 16) - 1)) + def testZip64(self): + files = self.getZip64Files() + self.doTest(".py", files, "f6") + + def testZip64CruftAndComment(self): + files = self.getZip64Files() + self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1)) + @support.requires_zlib() class CompressedZipImportTestCase(UncompressedZipImportTestCase): diff --git a/Lib/zipimport.py b/Lib/zipimport.py index 823a82ee830..21d2dca46f5 100644 --- a/Lib/zipimport.py +++ b/Lib/zipimport.py @@ -15,7 +15,7 @@ to Zip archives. #from importlib import _bootstrap_external #from importlib import _bootstrap # for _verbose_message import _frozen_importlib_external as _bootstrap_external -from _frozen_importlib_external import _unpack_uint16, _unpack_uint32 +from _frozen_importlib_external import _unpack_uint16, _unpack_uint32, _unpack_uint64 import _frozen_importlib as _bootstrap # for _verbose_message import _imp # for check_hash_based_pycs import _io # for open @@ -40,8 +40,14 @@ _zip_directory_cache = {} _module_type = type(sys) END_CENTRAL_DIR_SIZE = 22 -STRING_END_ARCHIVE = b'PK\x05\x06' +END_CENTRAL_DIR_SIZE_64 = 56 +END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20 +STRING_END_ARCHIVE = b'PK\x05\x06' # standard EOCD signature +STRING_END_LOCATOR_64 = b'PK\x06\x07' # Zip64 EOCD Locator signature +STRING_END_ZIP_64 = b'PK\x06\x06' # Zip64 EOCD signature MAX_COMMENT_LEN = (1 << 16) - 1 +MAX_UINT32 = 0xffffffff +ZIP64_EXTRA_TAG = 0x1 class zipimporter(_bootstrap_external._LoaderBasics): """zipimporter(archivepath) -> zipimporter object @@ -356,49 +362,72 @@ def _read_directory(archive): # to not cause problems when some runs 'python3 /dev/fd/9 9= 0 and pos64+END_CENTRAL_DIR_SIZE_64+END_CENTRAL_DIR_LOCATOR_SIZE_64==pos): + # Zip64 at "correct" offset from standard EOCD + buffer = data[pos64:pos64 + END_CENTRAL_DIR_SIZE_64] + if len(buffer) != END_CENTRAL_DIR_SIZE_64: + raise ZipImportError( + f"corrupt Zip64 file: Expected {END_CENTRAL_DIR_SIZE_64} byte " + f"zip64 central directory, but read {len(buffer)} bytes.", + path=archive) + header_position = file_size - len(data) + pos64 + + central_directory_size = _unpack_uint64(buffer[40:48]) + central_directory_position = _unpack_uint64(buffer[48:56]) + num_entries = _unpack_uint64(buffer[24:32]) + elif pos >= 0: buffer = data[pos:pos+END_CENTRAL_DIR_SIZE] if len(buffer) != END_CENTRAL_DIR_SIZE: raise ZipImportError(f"corrupt Zip file: {archive!r}", path=archive) + header_position = file_size - len(data) + pos - header_size = _unpack_uint32(buffer[12:16]) - header_offset = _unpack_uint32(buffer[16:20]) - if header_position < header_size: + # Buffer now contains a valid EOCD, and header_position gives the + # starting position of it. + central_directory_size = _unpack_uint32(buffer[12:16]) + central_directory_position = _unpack_uint32(buffer[16:20]) + num_entries = _unpack_uint16(buffer[8:10]) + + # N.b. if someday you want to prefer the standard (non-zip64) EOCD, + # you need to adjust position by 76 for arc to be 0. + else: + raise ZipImportError(f'not a Zip file: {archive!r}', + path=archive) + + # Buffer now contains a valid EOCD, and header_position gives the + # starting position of it. + # XXX: These are cursory checks but are not as exact or strict as they + # could be. Checking the arc-adjusted value is probably good too. + if header_position < central_directory_size: raise ZipImportError(f'bad central directory size: {archive!r}', path=archive) - if header_position < header_offset: + if header_position < central_directory_position: raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive) - header_position -= header_size - arc_offset = header_position - header_offset + header_position -= central_directory_size + # On just-a-zipfile these values are the same and arc_offset is zero; if + # the file has some bytes prepended, `arc_offset` is the number of such + # bytes. This is used for pex as well as self-extracting .exe. + arc_offset = header_position - central_directory_position if arc_offset < 0: raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive) @@ -415,6 +444,11 @@ def _read_directory(archive): raise EOFError('EOF read where not expected') # Start of file header if buffer[:4] != b'PK\x01\x02': + if count != num_entries: + raise ZipImportError( + f"mismatched num_entries: {count} should be {num_entries} in {archive!r}", + path=archive, + ) break # Bad: Central Dir File Header if len(buffer) != 46: raise EOFError('EOF read where not expected') @@ -430,9 +464,6 @@ def _read_directory(archive): comment_size = _unpack_uint16(buffer[32:34]) file_offset = _unpack_uint32(buffer[42:46]) header_size = name_size + extra_size + comment_size - if file_offset > header_offset: - raise ZipImportError(f'bad local header offset: {archive!r}', path=archive) - file_offset += arc_offset try: name = fp.read(name_size) @@ -444,7 +475,10 @@ def _read_directory(archive): # slower than reading the data because fseek flushes stdio's # internal buffers. See issue #8745. try: - if len(fp.read(header_size - name_size)) != header_size - name_size: + extra_data_len = header_size - name_size + extra_data = memoryview(fp.read(extra_data_len)) + + if len(extra_data) != extra_data_len: raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) except OSError: raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) @@ -461,6 +495,60 @@ def _read_directory(archive): name = name.replace('/', path_sep) path = _bootstrap_external._path_join(archive, name) + + # Ordering matches unpacking below. + if ( + file_size == MAX_UINT32 or + data_size == MAX_UINT32 or + file_offset == MAX_UINT32 + ): + # need to decode extra_data looking for a zip64 extra (which might not + # be present) + while extra_data: + if len(extra_data) < 4: + raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) + tag = _unpack_uint16(extra_data[:2]) + size = _unpack_uint16(extra_data[2:4]) + if len(extra_data) < 4 + size: + raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) + if tag == ZIP64_EXTRA_TAG: + if (len(extra_data) - 4) % 8 != 0: + raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) + num_extra_values = (len(extra_data) - 4) // 8 + if num_extra_values > 3: + raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) + values = struct.unpack_from(f"<{min(num_extra_values, 3)}Q", + extra_data, offset=4) + + # N.b. Here be dragons: the ordering of these is different than + # the header fields, and it's really easy to get it wrong since + # naturally-occuring zips that use all 3 are >4GB + if file_size == MAX_UINT32: + file_size = values.pop(0) + if data_size == MAX_UINT32: + data_size = values.pop(0) + if file_offset == MAX_UINT32: + file_offset = values.pop(0) + + break + + # For a typical zip, this bytes-slicing only happens 2-3 times, on + # small data like timestamps and filesizes. + extra_data = extra_data[4+size:] + else: + _bootstrap._verbose_message( + "zipimport: suspected zip64 but no zip64 extra for {!r}", + path, + ) + # XXX These two statements seem swapped because `central_directory_position` + # is a position within the actual file, but `file_offset` (when compared) is + # as encoded in the entry, not adjusted for this file. + # N.b. this must be after we've potentially read the zip64 extra which can + # change `file_offset`. + if file_offset > central_directory_position: + raise ZipImportError(f'bad local header offset: {archive!r}', path=archive) + file_offset += arc_offset + t = (path, compress, data_size, file_size, file_offset, time, date, crc) files[name] = t count += 1 diff --git a/Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst b/Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst new file mode 100644 index 00000000000..0358c0107cb --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst @@ -0,0 +1 @@ +The :mod:`zipimport` module can now read ZIP64 files.