From 1fb9bd222bfe96cdf8a82701a3192e45d0811555 Mon Sep 17 00:00:00 2001 From: Desmond Cheong Date: Fri, 7 Jul 2023 15:02:13 -0700 Subject: [PATCH] gh-103200: Fix performance issues with `zipimport.invalidate_caches()` (GH-103208) Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> Co-authored-by: Brett Cannon --- Lib/test/test_zipimport.py | 45 ++++++++++++++++-- Lib/zipimport.py | 46 ++++++++++--------- ...-04-03-08-09-40.gh-issue-103200.lq1Etz.rst | 1 + 3 files changed, 67 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-04-03-08-09-40.gh-issue-103200.lq1Etz.rst diff --git a/Lib/test/test_zipimport.py b/Lib/test/test_zipimport.py index 14c19719e26..c12798d221e 100644 --- a/Lib/test/test_zipimport.py +++ b/Lib/test/test_zipimport.py @@ -520,10 +520,10 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase): z.writestr(zinfo, data) zi = zipimport.zipimporter(TEMP_ZIP) - self.assertEqual(zi._files.keys(), files.keys()) + self.assertEqual(zi._get_files().keys(), files.keys()) # Check that the file information remains accurate after reloading zi.invalidate_caches() - self.assertEqual(zi._files.keys(), files.keys()) + self.assertEqual(zi._get_files().keys(), files.keys()) # Add a new file to the ZIP archive newfile = {"spam2" + pyc_ext: (NOW, test_pyc)} files.update(newfile) @@ -535,17 +535,54 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase): z.writestr(zinfo, data) # Check that we can detect the new file after invalidating the cache zi.invalidate_caches() - self.assertEqual(zi._files.keys(), files.keys()) + self.assertEqual(zi._get_files().keys(), files.keys()) spec = zi.find_spec('spam2') self.assertIsNotNone(spec) self.assertIsInstance(spec.loader, zipimport.zipimporter) # Check that the cached data is removed if the file is deleted os.remove(TEMP_ZIP) zi.invalidate_caches() - self.assertFalse(zi._files) + self.assertFalse(zi._get_files()) self.assertIsNone(zipimport._zip_directory_cache.get(zi.archive)) self.assertIsNone(zi.find_spec("name_does_not_matter")) + def testInvalidateCachesWithMultipleZipimports(self): + packdir = TESTPACK + os.sep + packdir2 = packdir + TESTPACK2 + os.sep + files = {packdir + "__init__" + pyc_ext: (NOW, test_pyc), + packdir2 + "__init__" + pyc_ext: (NOW, test_pyc), + packdir2 + TESTMOD + pyc_ext: (NOW, test_pyc), + "spam" + pyc_ext: (NOW, test_pyc)} + self.addCleanup(os_helper.unlink, TEMP_ZIP) + with ZipFile(TEMP_ZIP, "w") as z: + for name, (mtime, data) in files.items(): + zinfo = ZipInfo(name, time.localtime(mtime)) + zinfo.compress_type = self.compression + zinfo.comment = b"spam" + z.writestr(zinfo, data) + + zi = zipimport.zipimporter(TEMP_ZIP) + self.assertEqual(zi._get_files().keys(), files.keys()) + # Zipimporter for the same path. + zi2 = zipimport.zipimporter(TEMP_ZIP) + self.assertEqual(zi2._get_files().keys(), files.keys()) + # Add a new file to the ZIP archive to make the cache wrong. + newfile = {"spam2" + pyc_ext: (NOW, test_pyc)} + files.update(newfile) + with ZipFile(TEMP_ZIP, "a") as z: + for name, (mtime, data) in newfile.items(): + zinfo = ZipInfo(name, time.localtime(mtime)) + zinfo.compress_type = self.compression + zinfo.comment = b"spam" + z.writestr(zinfo, data) + # Invalidate the cache of the first zipimporter. + zi.invalidate_caches() + # Check that the second zipimporter detects the new file and isn't using a stale cache. + self.assertEqual(zi2._get_files().keys(), files.keys()) + spec = zi2.find_spec('spam2') + self.assertIsNotNone(spec) + self.assertIsInstance(spec.loader, zipimport.zipimporter) + def testZipImporterMethodsInSubDirectory(self): packdir = TESTPACK + os.sep packdir2 = packdir + TESTPACK2 + os.sep diff --git a/Lib/zipimport.py b/Lib/zipimport.py index a7333a4c490..5b9f614f02f 100644 --- a/Lib/zipimport.py +++ b/Lib/zipimport.py @@ -88,12 +88,8 @@ class zipimporter(_bootstrap_external._LoaderBasics): raise ZipImportError('not a Zip file', path=path) break - try: - files = _zip_directory_cache[path] - except KeyError: - files = _read_directory(path) - _zip_directory_cache[path] = files - self._files = files + if path not in _zip_directory_cache: + _zip_directory_cache[path] = _read_directory(path) self.archive = path # a prefix directory following the ZIP file path. self.prefix = _bootstrap_external._path_join(*prefix[::-1]) @@ -152,7 +148,7 @@ class zipimporter(_bootstrap_external._LoaderBasics): key = pathname[len(self.archive + path_sep):] try: - toc_entry = self._files[key] + toc_entry = self._get_files()[key] except KeyError: raise OSError(0, '', key) return _get_data(self.archive, toc_entry) @@ -189,7 +185,7 @@ class zipimporter(_bootstrap_external._LoaderBasics): fullpath = f'{path}.py' try: - toc_entry = self._files[fullpath] + toc_entry = self._get_files()[fullpath] except KeyError: # we have the module, but no source return None @@ -268,14 +264,22 @@ class zipimporter(_bootstrap_external._LoaderBasics): return ZipReader(self, fullname) - def invalidate_caches(self): - """Reload the file data of the archive path.""" + def _get_files(self): + """Return the files within the archive path.""" try: - self._files = _read_directory(self.archive) - _zip_directory_cache[self.archive] = self._files - except ZipImportError: - _zip_directory_cache.pop(self.archive, None) - self._files = {} + files = _zip_directory_cache[self.archive] + except KeyError: + try: + files = _zip_directory_cache[self.archive] = _read_directory(self.archive) + except ZipImportError: + files = {} + + return files + + + def invalidate_caches(self): + """Invalidates the cache of file data of the archive path.""" + _zip_directory_cache.pop(self.archive, None) def __repr__(self): @@ -305,15 +309,15 @@ def _is_dir(self, path): # of a namespace package. We test by seeing if the name, with an # appended path separator, exists. dirpath = path + path_sep - # If dirpath is present in self._files, we have a directory. - return dirpath in self._files + # If dirpath is present in self._get_files(), we have a directory. + return dirpath in self._get_files() # Return some information about a module. def _get_module_info(self, fullname): path = _get_module_path(self, fullname) for suffix, isbytecode, ispackage in _zip_searchorder: fullpath = path + suffix - if fullpath in self._files: + if fullpath in self._get_files(): return ispackage return None @@ -656,7 +660,7 @@ def _get_mtime_and_size_of_source(self, path): # strip 'c' or 'o' from *.py[co] assert path[-1:] in ('c', 'o') path = path[:-1] - toc_entry = self._files[path] + toc_entry = self._get_files()[path] # fetch the time stamp of the .py file for comparison # with an embedded pyc time stamp time = toc_entry[5] @@ -676,7 +680,7 @@ def _get_pyc_source(self, path): path = path[:-1] try: - toc_entry = self._files[path] + toc_entry = self._get_files()[path] except KeyError: return None else: @@ -692,7 +696,7 @@ def _get_module_code(self, fullname): fullpath = path + suffix _bootstrap._verbose_message('trying {}{}{}', self.archive, path_sep, fullpath, verbosity=2) try: - toc_entry = self._files[fullpath] + toc_entry = self._get_files()[fullpath] except KeyError: pass else: diff --git a/Misc/NEWS.d/next/Library/2023-04-03-08-09-40.gh-issue-103200.lq1Etz.rst b/Misc/NEWS.d/next/Library/2023-04-03-08-09-40.gh-issue-103200.lq1Etz.rst new file mode 100644 index 00000000000..e264e0c81d3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-04-03-08-09-40.gh-issue-103200.lq1Etz.rst @@ -0,0 +1 @@ +Fix cache repopulation semantics of zipimport.invalidate_caches(). The cache is now repopulated upon retrieving files with an invalid cache, not when the cache is invalidated.