bpo-39595: Improve zipfile.Path performance (#18406)
* Improve zipfile.Path performance on zipfiles with a large number of entries. * 📜🤖 Added by blurb_it. * Add bpo to blurb * Sync with importlib_metadata 1.5 (6fe70ca) * Update blurb. * Remove compatibility code * Add stubs module, omitted from earlier commit Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
This commit is contained in:
parent
e6be9b59a9
commit
e5bd73632e
|
@ -391,6 +391,7 @@ class FastPath:
|
|||
|
||||
def __init__(self, root):
|
||||
self.root = root
|
||||
self.base = os.path.basename(root).lower()
|
||||
|
||||
def joinpath(self, child):
|
||||
return pathlib.Path(self.root, child)
|
||||
|
@ -413,12 +414,11 @@ class FastPath:
|
|||
)
|
||||
|
||||
def is_egg(self, search):
|
||||
root_n_low = os.path.split(self.root)[1].lower()
|
||||
|
||||
base = self.base
|
||||
return (
|
||||
root_n_low == search.normalized + '.egg'
|
||||
or root_n_low.startswith(search.prefix)
|
||||
and root_n_low.endswith('.egg'))
|
||||
base == search.versionless_egg_name
|
||||
or base.startswith(search.prefix)
|
||||
and base.endswith('.egg'))
|
||||
|
||||
def search(self, name):
|
||||
for child in self.children():
|
||||
|
@ -439,6 +439,7 @@ class Prepared:
|
|||
prefix = ''
|
||||
suffixes = '.dist-info', '.egg-info'
|
||||
exact_matches = [''][:0]
|
||||
versionless_egg_name = ''
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
|
@ -448,6 +449,7 @@ class Prepared:
|
|||
self.prefix = self.normalized + '-'
|
||||
self.exact_matches = [
|
||||
self.normalized + suffix for suffix in self.suffixes]
|
||||
self.versionless_egg_name = self.normalized + '.egg'
|
||||
|
||||
|
||||
class MetadataPathFinder(DistributionFinder):
|
||||
|
|
|
@ -47,14 +47,28 @@ def tempdir_as_cwd():
|
|||
yield tmp
|
||||
|
||||
|
||||
class SiteDir:
|
||||
@contextlib.contextmanager
|
||||
def install_finder(finder):
|
||||
sys.meta_path.append(finder)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
sys.meta_path.remove(finder)
|
||||
|
||||
|
||||
class Fixtures:
|
||||
def setUp(self):
|
||||
self.fixtures = ExitStack()
|
||||
self.addCleanup(self.fixtures.close)
|
||||
|
||||
|
||||
class SiteDir(Fixtures):
|
||||
def setUp(self):
|
||||
super(SiteDir, self).setUp()
|
||||
self.site_dir = self.fixtures.enter_context(tempdir())
|
||||
|
||||
|
||||
class OnSysPath:
|
||||
class OnSysPath(Fixtures):
|
||||
@staticmethod
|
||||
@contextlib.contextmanager
|
||||
def add_sys_path(dir):
|
||||
|
@ -198,3 +212,8 @@ def build_files(file_defs, prefix=pathlib.Path()):
|
|||
def DALS(str):
|
||||
"Dedent and left-strip"
|
||||
return textwrap.dedent(str).lstrip()
|
||||
|
||||
|
||||
class NullFinder:
|
||||
def find_module(self, name):
|
||||
pass
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
import unittest
|
||||
|
||||
|
||||
class fake_filesystem_unittest:
|
||||
"""
|
||||
Stubbed version of the pyfakefs module
|
||||
"""
|
||||
class TestCase(unittest.TestCase):
|
||||
def setUpPyfakefs(self):
|
||||
self.skipTest("pyfakefs not available")
|
|
@ -7,6 +7,11 @@ import textwrap
|
|||
import unittest
|
||||
import importlib.metadata
|
||||
|
||||
try:
|
||||
import pyfakefs.fake_filesystem_unittest as ffs
|
||||
except ImportError:
|
||||
from .stubs import fake_filesystem_unittest as ffs
|
||||
|
||||
from . import fixtures
|
||||
from importlib.metadata import (
|
||||
Distribution, EntryPoint,
|
||||
|
@ -185,6 +190,33 @@ class DirectoryTest(fixtures.OnSysPath, fixtures.SiteDir, unittest.TestCase):
|
|||
version('foo')
|
||||
|
||||
|
||||
class MissingSysPath(fixtures.OnSysPath, unittest.TestCase):
|
||||
site_dir = '/does-not-exist'
|
||||
|
||||
def test_discovery(self):
|
||||
"""
|
||||
Discovering distributions should succeed even if
|
||||
there is an invalid path on sys.path.
|
||||
"""
|
||||
importlib.metadata.distributions()
|
||||
|
||||
|
||||
class InaccessibleSysPath(fixtures.OnSysPath, ffs.TestCase):
|
||||
site_dir = '/access-denied'
|
||||
|
||||
def setUp(self):
|
||||
super(InaccessibleSysPath, self).setUp()
|
||||
self.setUpPyfakefs()
|
||||
self.fs.create_dir(self.site_dir, perm_bits=000)
|
||||
|
||||
def test_discovery(self):
|
||||
"""
|
||||
Discovering distributions should succeed even if
|
||||
there is an invalid path on sys.path.
|
||||
"""
|
||||
list(importlib.metadata.distributions())
|
||||
|
||||
|
||||
class TestEntryPoints(unittest.TestCase):
|
||||
def __init__(self, *args):
|
||||
super(TestEntryPoints, self).__init__(*args)
|
||||
|
|
|
@ -2724,16 +2724,71 @@ class CommandLineTest(unittest.TestCase):
|
|||
self.assertEqual(f.read(), zf.read(zi))
|
||||
|
||||
|
||||
class TestExecutablePrependedZip(unittest.TestCase):
|
||||
"""Test our ability to open zip files with an executable prepended."""
|
||||
|
||||
def setUp(self):
|
||||
self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata')
|
||||
self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata')
|
||||
|
||||
def _test_zip_works(self, name):
|
||||
# bpo28494 sanity check: ensure is_zipfile works on these.
|
||||
self.assertTrue(zipfile.is_zipfile(name),
|
||||
f'is_zipfile failed on {name}')
|
||||
# Ensure we can operate on these via ZipFile.
|
||||
with zipfile.ZipFile(name) as zipfp:
|
||||
for n in zipfp.namelist():
|
||||
data = zipfp.read(n)
|
||||
self.assertIn(b'FAVORITE_NUMBER', data)
|
||||
|
||||
def test_read_zip_with_exe_prepended(self):
|
||||
self._test_zip_works(self.exe_zip)
|
||||
|
||||
def test_read_zip64_with_exe_prepended(self):
|
||||
self._test_zip_works(self.exe_zip64)
|
||||
|
||||
@unittest.skipUnless(sys.executable, 'sys.executable required.')
|
||||
@unittest.skipUnless(os.access('/bin/bash', os.X_OK),
|
||||
'Test relies on #!/bin/bash working.')
|
||||
def test_execute_zip2(self):
|
||||
output = subprocess.check_output([self.exe_zip, sys.executable])
|
||||
self.assertIn(b'number in executable: 5', output)
|
||||
|
||||
@unittest.skipUnless(sys.executable, 'sys.executable required.')
|
||||
@unittest.skipUnless(os.access('/bin/bash', os.X_OK),
|
||||
'Test relies on #!/bin/bash working.')
|
||||
def test_execute_zip64(self):
|
||||
output = subprocess.check_output([self.exe_zip64, sys.executable])
|
||||
self.assertIn(b'number in executable: 5', output)
|
||||
|
||||
|
||||
# Poor man's technique to consume a (smallish) iterable.
|
||||
consume = tuple
|
||||
|
||||
|
||||
# from jaraco.itertools 5.0
|
||||
class jaraco:
|
||||
class itertools:
|
||||
class Counter:
|
||||
def __init__(self, i):
|
||||
self.count = 0
|
||||
self._orig_iter = iter(i)
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
result = next(self._orig_iter)
|
||||
self.count += 1
|
||||
return result
|
||||
|
||||
|
||||
def add_dirs(zf):
|
||||
"""
|
||||
Given a writable zip file zf, inject directory entries for
|
||||
any directories implied by the presence of children.
|
||||
"""
|
||||
for name in zipfile.Path._implied_dirs(zf.namelist()):
|
||||
for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()):
|
||||
zf.writestr(name, b"")
|
||||
return zf
|
||||
|
||||
|
@ -2774,44 +2829,6 @@ def build_alpharep_fixture():
|
|||
return zf
|
||||
|
||||
|
||||
class TestExecutablePrependedZip(unittest.TestCase):
|
||||
"""Test our ability to open zip files with an executable prepended."""
|
||||
|
||||
def setUp(self):
|
||||
self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata')
|
||||
self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata')
|
||||
|
||||
def _test_zip_works(self, name):
|
||||
# bpo-28494 sanity check: ensure is_zipfile works on these.
|
||||
self.assertTrue(zipfile.is_zipfile(name),
|
||||
f'is_zipfile failed on {name}')
|
||||
# Ensure we can operate on these via ZipFile.
|
||||
with zipfile.ZipFile(name) as zipfp:
|
||||
for n in zipfp.namelist():
|
||||
data = zipfp.read(n)
|
||||
self.assertIn(b'FAVORITE_NUMBER', data)
|
||||
|
||||
def test_read_zip_with_exe_prepended(self):
|
||||
self._test_zip_works(self.exe_zip)
|
||||
|
||||
def test_read_zip64_with_exe_prepended(self):
|
||||
self._test_zip_works(self.exe_zip64)
|
||||
|
||||
@unittest.skipUnless(sys.executable, 'sys.executable required.')
|
||||
@unittest.skipUnless(os.access('/bin/bash', os.X_OK),
|
||||
'Test relies on #!/bin/bash working.')
|
||||
def test_execute_zip2(self):
|
||||
output = subprocess.check_output([self.exe_zip, sys.executable])
|
||||
self.assertIn(b'number in executable: 5', output)
|
||||
|
||||
@unittest.skipUnless(sys.executable, 'sys.executable required.')
|
||||
@unittest.skipUnless(os.access('/bin/bash', os.X_OK),
|
||||
'Test relies on #!/bin/bash working.')
|
||||
def test_execute_zip64(self):
|
||||
output = subprocess.check_output([self.exe_zip64, sys.executable])
|
||||
self.assertIn(b'number in executable: 5', output)
|
||||
|
||||
|
||||
class TestPath(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.fixtures = contextlib.ExitStack()
|
||||
|
@ -2849,6 +2866,14 @@ class TestPath(unittest.TestCase):
|
|||
i, = h.iterdir()
|
||||
assert i.is_file()
|
||||
|
||||
def test_subdir_is_dir(self):
|
||||
for alpharep in self.zipfile_alpharep():
|
||||
root = zipfile.Path(alpharep)
|
||||
assert (root / 'b').is_dir()
|
||||
assert (root / 'b/').is_dir()
|
||||
assert (root / 'g').is_dir()
|
||||
assert (root / 'g/').is_dir()
|
||||
|
||||
def test_open(self):
|
||||
for alpharep in self.zipfile_alpharep():
|
||||
root = zipfile.Path(alpharep)
|
||||
|
@ -2910,6 +2935,45 @@ class TestPath(unittest.TestCase):
|
|||
root = zipfile.Path(alpharep)
|
||||
assert (root / 'missing dir/').parent.at == ''
|
||||
|
||||
def test_mutability(self):
|
||||
"""
|
||||
If the underlying zipfile is changed, the Path object should
|
||||
reflect that change.
|
||||
"""
|
||||
for alpharep in self.zipfile_alpharep():
|
||||
root = zipfile.Path(alpharep)
|
||||
a, b, g = root.iterdir()
|
||||
alpharep.writestr('foo.txt', 'foo')
|
||||
alpharep.writestr('bar/baz.txt', 'baz')
|
||||
assert any(
|
||||
child.name == 'foo.txt'
|
||||
for child in root.iterdir())
|
||||
assert (root / 'foo.txt').read_text() == 'foo'
|
||||
baz, = (root / 'bar').iterdir()
|
||||
assert baz.read_text() == 'baz'
|
||||
|
||||
HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13
|
||||
|
||||
def huge_zipfile(self):
|
||||
"""Create a read-only zipfile with a huge number of entries entries."""
|
||||
strm = io.BytesIO()
|
||||
zf = zipfile.ZipFile(strm, "w")
|
||||
for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)):
|
||||
zf.writestr(entry, entry)
|
||||
zf.mode = 'r'
|
||||
return zf
|
||||
|
||||
def test_joinpath_constant_time(self):
|
||||
"""
|
||||
Ensure joinpath on items in zipfile is linear time.
|
||||
"""
|
||||
root = zipfile.Path(self.huge_zipfile())
|
||||
entries = jaraco.itertools.Counter(root.iterdir())
|
||||
for entry in entries:
|
||||
entry.joinpath('suffix')
|
||||
# Check the file iterated all items
|
||||
assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
102
Lib/zipfile.py
102
Lib/zipfile.py
|
@ -16,6 +16,8 @@ import struct
|
|||
import sys
|
||||
import threading
|
||||
import time
|
||||
import contextlib
|
||||
from collections import OrderedDict
|
||||
|
||||
try:
|
||||
import zlib # We may need its compression method
|
||||
|
@ -2159,6 +2161,79 @@ def _ancestry(path):
|
|||
path, tail = posixpath.split(path)
|
||||
|
||||
|
||||
class CompleteDirs(ZipFile):
|
||||
"""
|
||||
A ZipFile subclass that ensures that implied directories
|
||||
are always included in the namelist.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _implied_dirs(names):
|
||||
parents = itertools.chain.from_iterable(map(_parents, names))
|
||||
# Deduplicate entries in original order
|
||||
implied_dirs = OrderedDict.fromkeys(
|
||||
p + posixpath.sep for p in parents
|
||||
# Cast names to a set for O(1) lookups
|
||||
if p + posixpath.sep not in set(names)
|
||||
)
|
||||
return implied_dirs
|
||||
|
||||
def namelist(self):
|
||||
names = super(CompleteDirs, self).namelist()
|
||||
return names + list(self._implied_dirs(names))
|
||||
|
||||
def _name_set(self):
|
||||
return set(self.namelist())
|
||||
|
||||
def resolve_dir(self, name):
|
||||
"""
|
||||
If the name represents a directory, return that name
|
||||
as a directory (with the trailing slash).
|
||||
"""
|
||||
names = self._name_set()
|
||||
dirname = name + '/'
|
||||
dir_match = name not in names and dirname in names
|
||||
return dirname if dir_match else name
|
||||
|
||||
@classmethod
|
||||
def make(cls, source):
|
||||
"""
|
||||
Given a source (filename or zipfile), return an
|
||||
appropriate CompleteDirs subclass.
|
||||
"""
|
||||
if isinstance(source, CompleteDirs):
|
||||
return source
|
||||
|
||||
if not isinstance(source, ZipFile):
|
||||
return cls(source)
|
||||
|
||||
# Only allow for FastPath when supplied zipfile is read-only
|
||||
if 'r' not in source.mode:
|
||||
cls = CompleteDirs
|
||||
|
||||
res = cls.__new__(cls)
|
||||
vars(res).update(vars(source))
|
||||
return res
|
||||
|
||||
|
||||
class FastLookup(CompleteDirs):
|
||||
"""
|
||||
ZipFile subclass to ensure implicit
|
||||
dirs exist and are resolved rapidly.
|
||||
"""
|
||||
def namelist(self):
|
||||
with contextlib.suppress(AttributeError):
|
||||
return self.__names
|
||||
self.__names = super(FastLookup, self).namelist()
|
||||
return self.__names
|
||||
|
||||
def _name_set(self):
|
||||
with contextlib.suppress(AttributeError):
|
||||
return self.__lookup
|
||||
self.__lookup = super(FastLookup, self)._name_set()
|
||||
return self.__lookup
|
||||
|
||||
|
||||
class Path:
|
||||
"""
|
||||
A pathlib-compatible interface for zip files.
|
||||
|
@ -2227,7 +2302,7 @@ class Path:
|
|||
__repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})"
|
||||
|
||||
def __init__(self, root, at=""):
|
||||
self.root = root if isinstance(root, ZipFile) else ZipFile(root)
|
||||
self.root = FastLookup.make(root)
|
||||
self.at = at
|
||||
|
||||
@property
|
||||
|
@ -2259,12 +2334,12 @@ class Path:
|
|||
return not self.is_dir()
|
||||
|
||||
def exists(self):
|
||||
return self.at in self._names()
|
||||
return self.at in self.root._name_set()
|
||||
|
||||
def iterdir(self):
|
||||
if not self.is_dir():
|
||||
raise ValueError("Can't listdir a file")
|
||||
subs = map(self._next, self._names())
|
||||
subs = map(self._next, self.root.namelist())
|
||||
return filter(self._is_child, subs)
|
||||
|
||||
def __str__(self):
|
||||
|
@ -2275,25 +2350,10 @@ class Path:
|
|||
|
||||
def joinpath(self, add):
|
||||
next = posixpath.join(self.at, add)
|
||||
next_dir = posixpath.join(self.at, add, "")
|
||||
names = self._names()
|
||||
return self._next(next_dir if next not in names and next_dir in names else next)
|
||||
return self._next(self.root.resolve_dir(next))
|
||||
|
||||
__truediv__ = joinpath
|
||||
|
||||
@staticmethod
|
||||
def _implied_dirs(names):
|
||||
return _unique_everseen(
|
||||
parent + "/"
|
||||
for name in names
|
||||
for parent in _parents(name)
|
||||
if parent + "/" not in names
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _add_implied_dirs(cls, names):
|
||||
return names + list(cls._implied_dirs(names))
|
||||
|
||||
@property
|
||||
def parent(self):
|
||||
parent_at = posixpath.dirname(self.at.rstrip('/'))
|
||||
|
@ -2301,9 +2361,6 @@ class Path:
|
|||
parent_at += '/'
|
||||
return self._next(parent_at)
|
||||
|
||||
def _names(self):
|
||||
return self._add_implied_dirs(self.root.namelist())
|
||||
|
||||
|
||||
def main(args=None):
|
||||
import argparse
|
||||
|
@ -2365,5 +2422,6 @@ def main(args=None):
|
|||
zippath = ''
|
||||
addToZip(zf, path, zippath)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Improved performance of zipfile.Path for files with a large number of entries. Also improved performance and fixed minor issue as published with `importlib_metadata 1.5 <https://importlib-metadata.readthedocs.io/en/latest/changelog%20(links).html#v1-5-0>`_.
|
Loading…
Reference in New Issue