mirror of https://github.com/python/cpython
gh-89739: gh-77140: Support zip64 in zipimport (GH-94146)
* Reads zip64 files as produced by the zipfile module * Include tests (somewhat slow, however, because of the need to create "large" zips) * About the same amount of strictness reading invalid zip files as zipfile has * Still works on files with prepended data (like pex) There are a lot more test cases at https://github.com/thatch/zipimport64/ that give me confidence that this works for real-world files. Fixes #89739 and #77140. --------- Co-authored-by: Itamar Ostricher <itamarost@gmail.com> Reviewed-by: Gregory P. Smith <greg@krypto.org>
This commit is contained in:
parent
2cedd25c14
commit
b44898299a
|
@ -30,6 +30,9 @@ Any files may be present in the ZIP archive, but importers are only invoked for
|
|||
corresponding :file:`.pyc` file, meaning that if a ZIP archive
|
||||
doesn't contain :file:`.pyc` files, importing may be rather slow.
|
||||
|
||||
.. versionchanged:: 3.13
|
||||
ZIP64 is supported
|
||||
|
||||
.. versionchanged:: 3.8
|
||||
Previously, ZIP archives with an archive comment were not supported.
|
||||
|
||||
|
|
|
@ -700,6 +700,12 @@ xml.etree.ElementTree
|
|||
:func:`~xml.etree.ElementTree.iterparse` for explicit cleaning up.
|
||||
(Contributed by Serhiy Storchaka in :gh:`69893`.)
|
||||
|
||||
zipimport
|
||||
---------
|
||||
|
||||
* Gains support for ZIP64 format files. Everybody loves huge code, right?
|
||||
(Contributed by Tim Hatch in :gh:`94146`.)
|
||||
|
||||
|
||||
Optimizations
|
||||
=============
|
||||
|
|
|
@ -81,6 +81,11 @@ def _pack_uint32(x):
|
|||
return (int(x) & 0xFFFFFFFF).to_bytes(4, 'little')
|
||||
|
||||
|
||||
def _unpack_uint64(data):
|
||||
"""Convert 8 bytes in little-endian to an integer."""
|
||||
assert len(data) == 8
|
||||
return int.from_bytes(data, 'little')
|
||||
|
||||
def _unpack_uint32(data):
|
||||
"""Convert 4 bytes in little-endian to an integer."""
|
||||
assert len(data) == 4
|
||||
|
|
|
@ -128,6 +128,10 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
|
|||
f.write(stuff)
|
||||
f.write(data)
|
||||
|
||||
def getZip64Files(self):
    # This is the simplest way to make zipfile generate the zip64 EOCD
    # block: 65537 entries (f0.py .. f65536.py) is more than a 16-bit
    # entry count can represent.
    entries = {}
    for index in range(65537):
        entries[f"f{index}.py"] = (NOW, test_src)
    return entries
|
||||
|
||||
def doTest(self, expected_ext, files, *modules, **kw):
|
||||
self.makeZip(files, **kw)
|
||||
|
||||
|
@ -798,6 +802,14 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
|
|||
files = {TESTMOD + ".py": (NOW, test_src)}
|
||||
self.doTest(".py", files, TESTMOD, comment=b"c" * ((1 << 16) - 1))
|
||||
|
||||
def testZip64(self):
    # Build the archive from the zip64-sized file set and import one of
    # its modules ("f6").
    self.doTest(".py", self.getZip64Files(), "f6")
|
||||
|
||||
def testZip64CruftAndComment(self):
    # Zip64-sized file set plus an archive comment of (1 << 16) - 1
    # bytes; the module imported is the last one generated ("f65536").
    # NOTE(review): despite the test name, this call only passes
    # `comment=`; no prepended cruft is requested here -- confirm intent.
    archive_comment = b"c" * ((1 << 16) - 1)
    self.doTest(".py", self.getZip64Files(), "f65536",
                comment=archive_comment)
|
||||
|
||||
|
||||
@support.requires_zlib()
|
||||
class CompressedZipImportTestCase(UncompressedZipImportTestCase):
|
||||
|
|
142
Lib/zipimport.py
142
Lib/zipimport.py
|
@ -15,7 +15,7 @@ to Zip archives.
|
|||
#from importlib import _bootstrap_external
|
||||
#from importlib import _bootstrap # for _verbose_message
|
||||
import _frozen_importlib_external as _bootstrap_external
|
||||
from _frozen_importlib_external import _unpack_uint16, _unpack_uint32
|
||||
from _frozen_importlib_external import _unpack_uint16, _unpack_uint32, _unpack_uint64
|
||||
import _frozen_importlib as _bootstrap # for _verbose_message
|
||||
import _imp # for check_hash_based_pycs
|
||||
import _io # for open
|
||||
|
@ -40,8 +40,14 @@ _zip_directory_cache = {}
|
|||
_module_type = type(sys)
|
||||
|
||||
END_CENTRAL_DIR_SIZE = 22
|
||||
STRING_END_ARCHIVE = b'PK\x05\x06'
|
||||
END_CENTRAL_DIR_SIZE_64 = 56
|
||||
END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20
|
||||
STRING_END_ARCHIVE = b'PK\x05\x06' # standard EOCD signature
|
||||
STRING_END_LOCATOR_64 = b'PK\x06\x07' # Zip64 EOCD Locator signature
|
||||
STRING_END_ZIP_64 = b'PK\x06\x06' # Zip64 EOCD signature
|
||||
MAX_COMMENT_LEN = (1 << 16) - 1
|
||||
MAX_UINT32 = 0xffffffff
|
||||
ZIP64_EXTRA_TAG = 0x1
|
||||
|
||||
class zipimporter(_bootstrap_external._LoaderBasics):
|
||||
"""zipimporter(archivepath) -> zipimporter object
|
||||
|
@ -356,16 +362,6 @@ def _read_directory(archive):
|
|||
# to not cause problems when someone runs 'python3 /dev/fd/9 9<some_script'
|
||||
start_offset = fp.tell()
|
||||
try:
|
||||
try:
|
||||
fp.seek(-END_CENTRAL_DIR_SIZE, 2)
|
||||
header_position = fp.tell()
|
||||
buffer = fp.read(END_CENTRAL_DIR_SIZE)
|
||||
except OSError:
|
||||
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
|
||||
if len(buffer) != END_CENTRAL_DIR_SIZE:
|
||||
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
|
||||
if buffer[:4] != STRING_END_ARCHIVE:
|
||||
# Bad: End of Central Dir signature
|
||||
# Check if there's a comment.
|
||||
try:
|
||||
fp.seek(0, 2)
|
||||
|
@ -373,32 +369,65 @@ def _read_directory(archive):
|
|||
except OSError:
|
||||
raise ZipImportError(f"can't read Zip file: {archive!r}",
|
||||
path=archive)
|
||||
max_comment_start = max(file_size - MAX_COMMENT_LEN -
|
||||
END_CENTRAL_DIR_SIZE, 0)
|
||||
max_comment_plus_dirs_size = (
|
||||
MAX_COMMENT_LEN + END_CENTRAL_DIR_SIZE +
|
||||
END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64)
|
||||
max_comment_start = max(file_size - max_comment_plus_dirs_size, 0)
|
||||
try:
|
||||
fp.seek(max_comment_start)
|
||||
data = fp.read()
|
||||
data = fp.read(max_comment_plus_dirs_size)
|
||||
except OSError:
|
||||
raise ZipImportError(f"can't read Zip file: {archive!r}",
|
||||
path=archive)
|
||||
pos = data.rfind(STRING_END_ARCHIVE)
|
||||
if pos < 0:
|
||||
raise ZipImportError(f'not a Zip file: {archive!r}',
|
||||
pos64 = data.rfind(STRING_END_ZIP_64)
|
||||
|
||||
if (pos64 >= 0 and pos64+END_CENTRAL_DIR_SIZE_64+END_CENTRAL_DIR_LOCATOR_SIZE_64==pos):
|
||||
# Zip64 at "correct" offset from standard EOCD
|
||||
buffer = data[pos64:pos64 + END_CENTRAL_DIR_SIZE_64]
|
||||
if len(buffer) != END_CENTRAL_DIR_SIZE_64:
|
||||
raise ZipImportError(
|
||||
f"corrupt Zip64 file: Expected {END_CENTRAL_DIR_SIZE_64} byte "
|
||||
f"zip64 central directory, but read {len(buffer)} bytes.",
|
||||
path=archive)
|
||||
header_position = file_size - len(data) + pos64
|
||||
|
||||
central_directory_size = _unpack_uint64(buffer[40:48])
|
||||
central_directory_position = _unpack_uint64(buffer[48:56])
|
||||
num_entries = _unpack_uint64(buffer[24:32])
|
||||
elif pos >= 0:
|
||||
buffer = data[pos:pos+END_CENTRAL_DIR_SIZE]
|
||||
if len(buffer) != END_CENTRAL_DIR_SIZE:
|
||||
raise ZipImportError(f"corrupt Zip file: {archive!r}",
|
||||
path=archive)
|
||||
|
||||
header_position = file_size - len(data) + pos
|
||||
|
||||
header_size = _unpack_uint32(buffer[12:16])
|
||||
header_offset = _unpack_uint32(buffer[16:20])
|
||||
if header_position < header_size:
|
||||
# Buffer now contains a valid EOCD, and header_position gives the
|
||||
# starting position of it.
|
||||
central_directory_size = _unpack_uint32(buffer[12:16])
|
||||
central_directory_position = _unpack_uint32(buffer[16:20])
|
||||
num_entries = _unpack_uint16(buffer[8:10])
|
||||
|
||||
# N.b. if someday you want to prefer the standard (non-zip64) EOCD,
|
||||
# you need to adjust position by 76 for arc to be 0.
|
||||
else:
|
||||
raise ZipImportError(f'not a Zip file: {archive!r}',
|
||||
path=archive)
|
||||
|
||||
# Buffer now contains a valid EOCD, and header_position gives the
|
||||
# starting position of it.
|
||||
# XXX: These are cursory checks but are not as exact or strict as they
|
||||
# could be. Checking the arc-adjusted value is probably good too.
|
||||
if header_position < central_directory_size:
|
||||
raise ZipImportError(f'bad central directory size: {archive!r}', path=archive)
|
||||
if header_position < header_offset:
|
||||
if header_position < central_directory_position:
|
||||
raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive)
|
||||
header_position -= header_size
|
||||
arc_offset = header_position - header_offset
|
||||
header_position -= central_directory_size
|
||||
# On just-a-zipfile these values are the same and arc_offset is zero; if
|
||||
# the file has some bytes prepended, `arc_offset` is the number of such
|
||||
# bytes. This is used for pex as well as self-extracting .exe.
|
||||
arc_offset = header_position - central_directory_position
|
||||
if arc_offset < 0:
|
||||
raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive)
|
||||
|
||||
|
@ -415,6 +444,11 @@ def _read_directory(archive):
|
|||
raise EOFError('EOF read where not expected')
|
||||
# Start of file header
|
||||
if buffer[:4] != b'PK\x01\x02':
|
||||
if count != num_entries:
|
||||
raise ZipImportError(
|
||||
f"mismatched num_entries: {count} should be {num_entries} in {archive!r}",
|
||||
path=archive,
|
||||
)
|
||||
break # Bad: Central Dir File Header
|
||||
if len(buffer) != 46:
|
||||
raise EOFError('EOF read where not expected')
|
||||
|
@ -430,9 +464,6 @@ def _read_directory(archive):
|
|||
comment_size = _unpack_uint16(buffer[32:34])
|
||||
file_offset = _unpack_uint32(buffer[42:46])
|
||||
header_size = name_size + extra_size + comment_size
|
||||
if file_offset > header_offset:
|
||||
raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
|
||||
file_offset += arc_offset
|
||||
|
||||
try:
|
||||
name = fp.read(name_size)
|
||||
|
@ -444,7 +475,10 @@ def _read_directory(archive):
|
|||
# slower than reading the data because fseek flushes stdio's
|
||||
# internal buffers. See issue #8745.
|
||||
try:
|
||||
if len(fp.read(header_size - name_size)) != header_size - name_size:
|
||||
extra_data_len = header_size - name_size
|
||||
extra_data = memoryview(fp.read(extra_data_len))
|
||||
|
||||
if len(extra_data) != extra_data_len:
|
||||
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
|
||||
except OSError:
|
||||
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
|
||||
|
@ -461,6 +495,60 @@ def _read_directory(archive):
|
|||
|
||||
name = name.replace('/', path_sep)
|
||||
path = _bootstrap_external._path_join(archive, name)
|
||||
|
||||
# Ordering matches unpacking below.
|
||||
if (
|
||||
file_size == MAX_UINT32 or
|
||||
data_size == MAX_UINT32 or
|
||||
file_offset == MAX_UINT32
|
||||
):
|
||||
# need to decode extra_data looking for a zip64 extra (which might not
|
||||
# be present)
|
||||
while extra_data:
|
||||
if len(extra_data) < 4:
|
||||
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
|
||||
tag = _unpack_uint16(extra_data[:2])
|
||||
size = _unpack_uint16(extra_data[2:4])
|
||||
if len(extra_data) < 4 + size:
|
||||
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
|
||||
if tag == ZIP64_EXTRA_TAG:
|
||||
if (len(extra_data) - 4) % 8 != 0:
|
||||
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
|
||||
num_extra_values = (len(extra_data) - 4) // 8
|
||||
if num_extra_values > 3:
|
||||
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
|
||||
# struct.unpack_from() returns an immutable tuple, but the fields are
# consumed one at a time with values.pop(0) just below, so the result
# must be materialized as a list (a tuple has no .pop()).
# `struct` is imported locally because the import lines visible for
# this module do not bring it into scope.
import struct
values = list(struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
                                 extra_data, offset=4))
|
||||
|
||||
# N.b. Here be dragons: the ordering of these is different than
|
||||
# the header fields, and it's really easy to get it wrong since
|
||||
# naturally-occurring zips that use all 3 are >4GB
|
||||
if file_size == MAX_UINT32:
|
||||
file_size = values.pop(0)
|
||||
if data_size == MAX_UINT32:
|
||||
data_size = values.pop(0)
|
||||
if file_offset == MAX_UINT32:
|
||||
file_offset = values.pop(0)
|
||||
|
||||
break
|
||||
|
||||
# For a typical zip, this bytes-slicing only happens 2-3 times, on
|
||||
# small data like timestamps and filesizes.
|
||||
extra_data = extra_data[4+size:]
|
||||
else:
|
||||
_bootstrap._verbose_message(
|
||||
"zipimport: suspected zip64 but no zip64 extra for {!r}",
|
||||
path,
|
||||
)
|
||||
# XXX These two statements seem swapped because `central_directory_position`
|
||||
# is a position within the actual file, but `file_offset` (when compared) is
|
||||
# as encoded in the entry, not adjusted for this file.
|
||||
# N.b. this must be after we've potentially read the zip64 extra which can
|
||||
# change `file_offset`.
|
||||
if file_offset > central_directory_position:
|
||||
raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
|
||||
file_offset += arc_offset
|
||||
|
||||
t = (path, compress, data_size, file_size, file_offset, time, date, crc)
|
||||
files[name] = t
|
||||
count += 1
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
The :mod:`zipimport` module can now read ZIP64 files.
|
Loading…
Reference in New Issue