bpo-39595: Improve zipfile.Path performance (#18406)

* Improve zipfile.Path performance on zipfiles with a large number of entries.

* 📜🤖 Added by blurb_it.

* Add bpo to blurb

* Sync with importlib_metadata 1.5 (6fe70ca)

* Update blurb.

* Remove compatibility code

* Add stubs module, omitted from earlier commit

Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
This commit is contained in:
Jason R. Coombs 2020-02-11 21:58:47 -05:00 committed by GitHub
parent e6be9b59a9
commit e5bd73632e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 254 additions and 68 deletions

View File

@ -391,6 +391,7 @@ class FastPath:
def __init__(self, root):
self.root = root
self.base = os.path.basename(root).lower()
def joinpath(self, child):
return pathlib.Path(self.root, child)
@ -413,12 +414,11 @@ class FastPath:
)
def is_egg(self, search):
root_n_low = os.path.split(self.root)[1].lower()
base = self.base
return (
root_n_low == search.normalized + '.egg'
or root_n_low.startswith(search.prefix)
and root_n_low.endswith('.egg'))
base == search.versionless_egg_name
or base.startswith(search.prefix)
and base.endswith('.egg'))
def search(self, name):
for child in self.children():
@ -439,6 +439,7 @@ class Prepared:
prefix = ''
suffixes = '.dist-info', '.egg-info'
exact_matches = [''][:0]
versionless_egg_name = ''
def __init__(self, name):
self.name = name
@ -448,6 +449,7 @@ class Prepared:
self.prefix = self.normalized + '-'
self.exact_matches = [
self.normalized + suffix for suffix in self.suffixes]
self.versionless_egg_name = self.normalized + '.egg'
class MetadataPathFinder(DistributionFinder):

View File

@ -47,14 +47,28 @@ def tempdir_as_cwd():
yield tmp
class SiteDir:
@contextlib.contextmanager
def install_finder(finder):
sys.meta_path.append(finder)
try:
yield
finally:
sys.meta_path.remove(finder)
class Fixtures:
def setUp(self):
self.fixtures = ExitStack()
self.addCleanup(self.fixtures.close)
class SiteDir(Fixtures):
def setUp(self):
super(SiteDir, self).setUp()
self.site_dir = self.fixtures.enter_context(tempdir())
class OnSysPath:
class OnSysPath(Fixtures):
@staticmethod
@contextlib.contextmanager
def add_sys_path(dir):
@ -198,3 +212,8 @@ def build_files(file_defs, prefix=pathlib.Path()):
def DALS(str):
"Dedent and left-strip"
return textwrap.dedent(str).lstrip()
class NullFinder:
def find_module(self, name):
pass

View File

@ -0,0 +1,10 @@
import unittest
class fake_filesystem_unittest:
"""
Stubbed version of the pyfakefs module
"""
class TestCase(unittest.TestCase):
def setUpPyfakefs(self):
self.skipTest("pyfakefs not available")

View File

@ -7,6 +7,11 @@ import textwrap
import unittest
import importlib.metadata
try:
import pyfakefs.fake_filesystem_unittest as ffs
except ImportError:
from .stubs import fake_filesystem_unittest as ffs
from . import fixtures
from importlib.metadata import (
Distribution, EntryPoint,
@ -185,6 +190,33 @@ class DirectoryTest(fixtures.OnSysPath, fixtures.SiteDir, unittest.TestCase):
version('foo')
class MissingSysPath(fixtures.OnSysPath, unittest.TestCase):
site_dir = '/does-not-exist'
def test_discovery(self):
"""
Discovering distributions should succeed even if
there is an invalid path on sys.path.
"""
importlib.metadata.distributions()
class InaccessibleSysPath(fixtures.OnSysPath, ffs.TestCase):
site_dir = '/access-denied'
def setUp(self):
super(InaccessibleSysPath, self).setUp()
self.setUpPyfakefs()
self.fs.create_dir(self.site_dir, perm_bits=000)
def test_discovery(self):
"""
Discovering distributions should succeed even if
there is an invalid path on sys.path.
"""
list(importlib.metadata.distributions())
class TestEntryPoints(unittest.TestCase):
def __init__(self, *args):
super(TestEntryPoints, self).__init__(*args)

View File

@ -2724,16 +2724,71 @@ class CommandLineTest(unittest.TestCase):
self.assertEqual(f.read(), zf.read(zi))
class TestExecutablePrependedZip(unittest.TestCase):
"""Test our ability to open zip files with an executable prepended."""
def setUp(self):
self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata')
self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata')
def _test_zip_works(self, name):
# bpo28494 sanity check: ensure is_zipfile works on these.
self.assertTrue(zipfile.is_zipfile(name),
f'is_zipfile failed on {name}')
# Ensure we can operate on these via ZipFile.
with zipfile.ZipFile(name) as zipfp:
for n in zipfp.namelist():
data = zipfp.read(n)
self.assertIn(b'FAVORITE_NUMBER', data)
def test_read_zip_with_exe_prepended(self):
self._test_zip_works(self.exe_zip)
def test_read_zip64_with_exe_prepended(self):
self._test_zip_works(self.exe_zip64)
@unittest.skipUnless(sys.executable, 'sys.executable required.')
@unittest.skipUnless(os.access('/bin/bash', os.X_OK),
'Test relies on #!/bin/bash working.')
def test_execute_zip2(self):
output = subprocess.check_output([self.exe_zip, sys.executable])
self.assertIn(b'number in executable: 5', output)
@unittest.skipUnless(sys.executable, 'sys.executable required.')
@unittest.skipUnless(os.access('/bin/bash', os.X_OK),
'Test relies on #!/bin/bash working.')
def test_execute_zip64(self):
output = subprocess.check_output([self.exe_zip64, sys.executable])
self.assertIn(b'number in executable: 5', output)
# Poor man's technique to consume a (smallish) iterable.
consume = tuple
# from jaraco.itertools 5.0
class jaraco:
class itertools:
class Counter:
def __init__(self, i):
self.count = 0
self._orig_iter = iter(i)
def __iter__(self):
return self
def __next__(self):
result = next(self._orig_iter)
self.count += 1
return result
def add_dirs(zf):
"""
Given a writable zip file zf, inject directory entries for
any directories implied by the presence of children.
"""
for name in zipfile.Path._implied_dirs(zf.namelist()):
for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()):
zf.writestr(name, b"")
return zf
@ -2774,44 +2829,6 @@ def build_alpharep_fixture():
return zf
class TestExecutablePrependedZip(unittest.TestCase):
"""Test our ability to open zip files with an executable prepended."""
def setUp(self):
self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata')
self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata')
def _test_zip_works(self, name):
# bpo-28494 sanity check: ensure is_zipfile works on these.
self.assertTrue(zipfile.is_zipfile(name),
f'is_zipfile failed on {name}')
# Ensure we can operate on these via ZipFile.
with zipfile.ZipFile(name) as zipfp:
for n in zipfp.namelist():
data = zipfp.read(n)
self.assertIn(b'FAVORITE_NUMBER', data)
def test_read_zip_with_exe_prepended(self):
self._test_zip_works(self.exe_zip)
def test_read_zip64_with_exe_prepended(self):
self._test_zip_works(self.exe_zip64)
@unittest.skipUnless(sys.executable, 'sys.executable required.')
@unittest.skipUnless(os.access('/bin/bash', os.X_OK),
'Test relies on #!/bin/bash working.')
def test_execute_zip2(self):
output = subprocess.check_output([self.exe_zip, sys.executable])
self.assertIn(b'number in executable: 5', output)
@unittest.skipUnless(sys.executable, 'sys.executable required.')
@unittest.skipUnless(os.access('/bin/bash', os.X_OK),
'Test relies on #!/bin/bash working.')
def test_execute_zip64(self):
output = subprocess.check_output([self.exe_zip64, sys.executable])
self.assertIn(b'number in executable: 5', output)
class TestPath(unittest.TestCase):
def setUp(self):
self.fixtures = contextlib.ExitStack()
@ -2849,6 +2866,14 @@ class TestPath(unittest.TestCase):
i, = h.iterdir()
assert i.is_file()
def test_subdir_is_dir(self):
for alpharep in self.zipfile_alpharep():
root = zipfile.Path(alpharep)
assert (root / 'b').is_dir()
assert (root / 'b/').is_dir()
assert (root / 'g').is_dir()
assert (root / 'g/').is_dir()
def test_open(self):
for alpharep in self.zipfile_alpharep():
root = zipfile.Path(alpharep)
@ -2910,6 +2935,45 @@ class TestPath(unittest.TestCase):
root = zipfile.Path(alpharep)
assert (root / 'missing dir/').parent.at == ''
def test_mutability(self):
"""
If the underlying zipfile is changed, the Path object should
reflect that change.
"""
for alpharep in self.zipfile_alpharep():
root = zipfile.Path(alpharep)
a, b, g = root.iterdir()
alpharep.writestr('foo.txt', 'foo')
alpharep.writestr('bar/baz.txt', 'baz')
assert any(
child.name == 'foo.txt'
for child in root.iterdir())
assert (root / 'foo.txt').read_text() == 'foo'
baz, = (root / 'bar').iterdir()
assert baz.read_text() == 'baz'
HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13
def huge_zipfile(self):
"""Create a read-only zipfile with a huge number of entries entries."""
strm = io.BytesIO()
zf = zipfile.ZipFile(strm, "w")
for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)):
zf.writestr(entry, entry)
zf.mode = 'r'
return zf
def test_joinpath_constant_time(self):
"""
Ensure joinpath on items in zipfile is linear time.
"""
root = zipfile.Path(self.huge_zipfile())
entries = jaraco.itertools.Counter(root.iterdir())
for entry in entries:
entry.joinpath('suffix')
# Check the file iterated all items
assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES
if __name__ == "__main__":
unittest.main()

View File

@ -16,6 +16,8 @@ import struct
import sys
import threading
import time
import contextlib
from collections import OrderedDict
try:
import zlib # We may need its compression method
@ -2159,6 +2161,79 @@ def _ancestry(path):
path, tail = posixpath.split(path)
class CompleteDirs(ZipFile):
"""
A ZipFile subclass that ensures that implied directories
are always included in the namelist.
"""
@staticmethod
def _implied_dirs(names):
parents = itertools.chain.from_iterable(map(_parents, names))
# Deduplicate entries in original order
implied_dirs = OrderedDict.fromkeys(
p + posixpath.sep for p in parents
# Cast names to a set for O(1) lookups
if p + posixpath.sep not in set(names)
)
return implied_dirs
def namelist(self):
names = super(CompleteDirs, self).namelist()
return names + list(self._implied_dirs(names))
def _name_set(self):
return set(self.namelist())
def resolve_dir(self, name):
"""
If the name represents a directory, return that name
as a directory (with the trailing slash).
"""
names = self._name_set()
dirname = name + '/'
dir_match = name not in names and dirname in names
return dirname if dir_match else name
@classmethod
def make(cls, source):
"""
Given a source (filename or zipfile), return an
appropriate CompleteDirs subclass.
"""
if isinstance(source, CompleteDirs):
return source
if not isinstance(source, ZipFile):
return cls(source)
# Only allow for FastPath when supplied zipfile is read-only
if 'r' not in source.mode:
cls = CompleteDirs
res = cls.__new__(cls)
vars(res).update(vars(source))
return res
class FastLookup(CompleteDirs):
"""
ZipFile subclass to ensure implicit
dirs exist and are resolved rapidly.
"""
def namelist(self):
with contextlib.suppress(AttributeError):
return self.__names
self.__names = super(FastLookup, self).namelist()
return self.__names
def _name_set(self):
with contextlib.suppress(AttributeError):
return self.__lookup
self.__lookup = super(FastLookup, self)._name_set()
return self.__lookup
class Path:
"""
A pathlib-compatible interface for zip files.
@ -2227,7 +2302,7 @@ class Path:
__repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})"
def __init__(self, root, at=""):
self.root = root if isinstance(root, ZipFile) else ZipFile(root)
self.root = FastLookup.make(root)
self.at = at
@property
@ -2259,12 +2334,12 @@ class Path:
return not self.is_dir()
def exists(self):
return self.at in self._names()
return self.at in self.root._name_set()
def iterdir(self):
if not self.is_dir():
raise ValueError("Can't listdir a file")
subs = map(self._next, self._names())
subs = map(self._next, self.root.namelist())
return filter(self._is_child, subs)
def __str__(self):
@ -2275,25 +2350,10 @@ class Path:
def joinpath(self, add):
next = posixpath.join(self.at, add)
next_dir = posixpath.join(self.at, add, "")
names = self._names()
return self._next(next_dir if next not in names and next_dir in names else next)
return self._next(self.root.resolve_dir(next))
__truediv__ = joinpath
@staticmethod
def _implied_dirs(names):
return _unique_everseen(
parent + "/"
for name in names
for parent in _parents(name)
if parent + "/" not in names
)
@classmethod
def _add_implied_dirs(cls, names):
return names + list(cls._implied_dirs(names))
@property
def parent(self):
parent_at = posixpath.dirname(self.at.rstrip('/'))
@ -2301,9 +2361,6 @@ class Path:
parent_at += '/'
return self._next(parent_at)
def _names(self):
return self._add_implied_dirs(self.root.namelist())
def main(args=None):
import argparse
@ -2365,5 +2422,6 @@ def main(args=None):
zippath = ''
addToZip(zf, path, zippath)
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
Improved performance of zipfile.Path for files with a large number of entries. Also improved performance and fixed minor issue as published with `importlib_metadata 1.5 <https://importlib-metadata.readthedocs.io/en/latest/changelog%20(links).html#v1-5-0>`_.