bpo-39667: Sync zipp 3.0 (GH-18540)

* bpo-39667: Improve pathlib.Path compatibility on zipfile.Path and correct performance degradation as found in zipp 3.0
* 📜🤖 Added by blurb_it.
* Update docs for new zipfile.Path.open
* Rely on dict, faster than OrderedDict.
* Syntax edits on docs

Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
2020-02-29 10:34:11 -06:00 · 2020-02-29 10:34:11 -06:00 · 0aeab5c438
parent 1f0cd3c61a
commit 0aeab5c438
4 changed files with 52 additions and 36 deletions
--- a/Doc/library/zipfile.rst
+++ b/Doc/library/zipfile.rst
@@ -489,10 +489,20 @@ Path objects are traversable using the ``/`` operator.

   The final path component.

-.. method:: Path.open(*, **)
+.. method:: Path.open(mode='r', *, pwd, **)

-   Invoke :meth:`ZipFile.open` on the current path. Accepts
-   the same arguments as :meth:`ZipFile.open`.
+   Invoke :meth:`ZipFile.open` on the current path.
+   Allows opening for read or write, text or binary
+   through supported modes: 'r', 'w', 'rb', 'wb'.
+   Positional and keyword arguments are passed through to
+   :class:`io.TextIOWrapper` when opened as text; passing them
+   in binary mode raises :exc:`ValueError`.
+   ``pwd`` is the ``pwd`` parameter to
+   :meth:`ZipFile.open`.
+
+   .. versionchanged:: 3.9
+      Added support for text and binary modes for open. Default
+      mode is now text.

 .. method:: Path.iterdir()

--- a/Lib/test/test_zipfile.py
+++ b/Lib/test/test_zipfile.py
@@ -5,6 +5,7 @@ import itertools
 import os
 import pathlib
 import posixpath
+import string
 import struct
 import subprocess
 import sys
@@ -2880,7 +2881,7 @@ class TestPath(unittest.TestCase):
            a, b, g = root.iterdir()
            with a.open() as strm:
                data = strm.read()
-            assert data == b"content of a"
+            assert data == "content of a"

    def test_read(self):
        for alpharep in self.zipfile_alpharep():
@@ -2974,6 +2975,11 @@ class TestPath(unittest.TestCase):
        # Check the file iterated all items
        assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES

+    # @func_timeout.func_set_timeout(3)
+    def test_implied_dirs_performance(self):
+        data = ['/'.join(string.ascii_lowercase + str(n)) for n in range(10000)]
+        zipfile.CompleteDirs._implied_dirs(data)
+

 if __name__ == "__main__":
    unittest.main()
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -17,7 +17,6 @@ import sys
 import threading
 import time
 import contextlib
-from collections import OrderedDict

 try:
    import zlib # We may need its compression method
@@ -2102,24 +2101,6 @@ class PyZipFile(ZipFile):
        return (fname, archivename)


-def _unique_everseen(iterable, key=None):
-    "List unique elements, preserving order. Remember all elements ever seen."
-    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
-    # unique_everseen('ABBCcAD', str.lower) --> A B C D
-    seen = set()
-    seen_add = seen.add
-    if key is None:
-        for element in itertools.filterfalse(seen.__contains__, iterable):
-            seen_add(element)
-            yield element
-    else:
-        for element in iterable:
-            k = key(element)
-            if k not in seen:
-                seen_add(k)
-                yield element
-
-
 def _parents(path):
    """
    Given a path with elements separated by
@@ -2161,6 +2142,18 @@ def _ancestry(path):
        path, tail = posixpath.split(path)


+_dedupe = dict.fromkeys
+"""Deduplicate an iterable in original order"""
+
+
+def _difference(minuend, subtrahend):
+    """
+    Return items in minuend not in subtrahend, retaining order
+    with O(1) lookup.
+    """
+    return itertools.filterfalse(set(subtrahend).__contains__, minuend)
+
+
 class CompleteDirs(ZipFile):
    """
    A ZipFile subclass that ensures that implied directories
@@ -2170,13 +2163,8 @@ class CompleteDirs(ZipFile):
    @staticmethod
    def _implied_dirs(names):
        parents = itertools.chain.from_iterable(map(_parents, names))
-        # Deduplicate entries in original order
-        implied_dirs = OrderedDict.fromkeys(
-            p + posixpath.sep for p in parents
-            # Cast names to a set for O(1) lookups
-            if p + posixpath.sep not in set(names)
-        )
-        return implied_dirs
+        as_dirs = (p + posixpath.sep for p in parents)
+        return _dedupe(_difference(as_dirs, names))

    def namelist(self):
        names = super(CompleteDirs, self).namelist()
@@ -2305,20 +2293,31 @@ class Path:
        self.root = FastLookup.make(root)
        self.at = at

-    @property
-    def open(self):
-        return functools.partial(self.root.open, self.at)
+    def open(self, mode='r', *args, **kwargs):
+        """
+        Open this entry as text or binary following the semantics
+        of ``pathlib.Path.open()`` by passing arguments through
+        to io.TextIOWrapper().
+        """
+        pwd = kwargs.pop('pwd', None)
+        zip_mode = mode[0]
+        stream = self.root.open(self.at, zip_mode, pwd=pwd)
+        if 'b' in mode:
+            if args or kwargs:
+                raise ValueError("encoding args invalid for binary operation")
+            return stream
+        return io.TextIOWrapper(stream, *args, **kwargs)

    @property
    def name(self):
        return posixpath.basename(self.at.rstrip("/"))

    def read_text(self, *args, **kwargs):
-        with self.open() as strm:
-            return io.TextIOWrapper(strm, *args, **kwargs).read()
+        with self.open('r', *args, **kwargs) as strm:
+            return strm.read()

    def read_bytes(self):
-        with self.open() as strm:
+        with self.open('rb') as strm:
            return strm.read()

    def _is_child(self, path):
--- a/Misc/NEWS.d/next/Library/2020-02-17-22-38-15.bpo-39667.QuzEHH.rst
+++ b/Misc/NEWS.d/next/Library/2020-02-17-22-38-15.bpo-39667.QuzEHH.rst
@@ -0,0 +1 @@
+Improve pathlib.Path compatibility on zipfile.Path and correct performance degradation as found in zipp 3.0.
				`@ -0,0 +1 @@`
				`Improve pathlib.Path compatibility on zipfile.Path and correct performance degradation as found in zipp 3.0.`