From 24af45172f74e4f01eb21d3aee7beab62417b833 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Tue, 6 Jun 2023 23:50:36 +0100 Subject: [PATCH] GH-102613: Fast recursive globbing in `pathlib.Path.glob()` (GH-104512) This commit introduces a 'walk-and-match' strategy for handling glob patterns that include a non-terminal `**` wildcard, such as `**/*.py`. For this example, the previous implementation recursively walked directories using `os.scandir()` when it expanded the `**` component, and then **scanned those same directories again** when expanded the `*.py` component. This is wasteful. In the new implementation, any components following a `**` wildcard are used to build a `re.Pattern` object, which is used to filter the results of the recursive walk. A pattern like `**/*.py` uses half the number of `os.scandir()` calls; a pattern like `**/*/*.py` a third, etc. This new algorithm does not apply if either: 1. The *follow_symlinks* argument is set to `None` (its default), or 2. The pattern contains `..` components. In these cases we fall back to the old implementation. This commit also replaces selector classes with selector functions. These generators directly yield results rather calling through to their successors. A new internal `Path._glob()` method takes care to chain these generators together, which simplifies the lazy algorithm and slightly improves performance. It should also be easier to understand and maintain. --- Doc/library/pathlib.rst | 12 +- Lib/pathlib.py | 265 +++++++++--------- Lib/test/test_pathlib.py | 18 ++ ...-05-15-18-57-42.gh-issue-102613.YD9yx-.rst | 4 + 4 files changed, 161 insertions(+), 138 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 67ef36890d5..ad801d5d7cd 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -917,6 +917,14 @@ call fails (for example because the path doesn't exist). PosixPath('setup.py'), PosixPath('test_pathlib.py')] + .. note:: + Using the "``**``" pattern in large directory trees may consume + an inordinate amount of time. + + .. tip:: + Set *follow_symlinks* to ``True`` or ``False`` to improve performance + of recursive globbing. + By default, or when the *case_sensitive* keyword-only argument is set to ``None``, this method matches paths using platform-specific casing rules: typically, case-sensitive on POSIX, and case-insensitive on Windows. @@ -927,10 +935,6 @@ call fails (for example because the path doesn't exist). wildcards. Set *follow_symlinks* to ``True`` to always follow symlinks, or ``False`` to treat all symlinks as files. - .. note:: - Using the "``**``" pattern in large directory trees may consume - an inordinate amount of time. - .. audit-event:: pathlib.Path.glob self,pattern pathlib.Path.glob .. versionchanged:: 3.11 diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 62406473b66..89c7b1ebdf7 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -78,33 +78,12 @@ _SWAP_SEP_AND_NEWLINE = { } -@functools.lru_cache() -def _make_selector(pattern_parts, flavour, case_sensitive): - pat = pattern_parts[0] - if not pat: - return _TerminatingSelector() - if pat == '**': - child_parts_idx = 1 - while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**': - child_parts_idx += 1 - child_parts = pattern_parts[child_parts_idx:] - if '**' in child_parts: - cls = _DoubleRecursiveWildcardSelector - else: - cls = _RecursiveWildcardSelector - else: - child_parts = pattern_parts[1:] - if pat == '..': - cls = _ParentSelector - elif '**' in pat: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: - cls = _WildcardSelector - return cls(pat, child_parts, flavour, case_sensitive) - - @functools.lru_cache(maxsize=256) def _compile_pattern(pat, case_sensitive): + """Compile given glob pattern to a re.Pattern object (observing case + sensitivity), or None if the pattern should match everything.""" + if pat == '*': + return None flags = re.NOFLAG if case_sensitive else re.IGNORECASE return re.compile(fnmatch.translate(pat), flags).match @@ -127,7 +106,11 @@ def _compile_pattern_lines(pattern_lines, case_sensitive): # Match the start of the path, or just after a path separator parts = ['^'] for part in pattern_lines.splitlines(keepends=True): - if part == '**\n': + if part == '*\n': + part = r'.+\n' + elif part == '*': + part = r'.+' + elif part == '**\n': # '**/' component: we use '[\s\S]' rather than '.' so that path # separators (i.e. newlines) are matched. The trailing '^' ensures # we terminate after a path separator (i.e. on a new line). @@ -154,114 +137,70 @@ def _compile_pattern_lines(pattern_lines, case_sensitive): return re.compile(''.join(parts), flags=flags) -class _Selector: - """A selector matches a specific glob pattern part against the children - of a given path.""" - - def __init__(self, child_parts, flavour, case_sensitive): - self.child_parts = child_parts - if child_parts: - self.successor = _make_selector(child_parts, flavour, case_sensitive) - self.dironly = True - else: - self.successor = _TerminatingSelector() - self.dironly = False - - def select_from(self, parent_path, follow_symlinks): - """Iterate over all child paths of `parent_path` matched by this - selector. This can contain parent_path itself.""" - path_cls = type(parent_path) - scandir = path_cls._scandir - if not parent_path.is_dir(): - return iter([]) - return self._select_from(parent_path, scandir, follow_symlinks) - - -class _TerminatingSelector: - - def _select_from(self, parent_path, scandir, follow_symlinks): - yield parent_path - - -class _ParentSelector(_Selector): - - def __init__(self, name, child_parts, flavour, case_sensitive): - _Selector.__init__(self, child_parts, flavour, case_sensitive) - - def _select_from(self, parent_path, scandir, follow_symlinks): - path = parent_path._make_child_relpath('..') - for p in self.successor._select_from(path, scandir, follow_symlinks): - yield p - - -class _WildcardSelector(_Selector): - - def __init__(self, pat, child_parts, flavour, case_sensitive): - _Selector.__init__(self, child_parts, flavour, case_sensitive) - if case_sensitive is None: - # TODO: evaluate case-sensitivity of each directory in _select_from() - case_sensitive = _is_case_sensitive(flavour) - self.match = _compile_pattern(pat, case_sensitive) - - def _select_from(self, parent_path, scandir, follow_symlinks): - follow_dirlinks = True if follow_symlinks is None else follow_symlinks +def _select_children(parent_paths, dir_only, follow_symlinks, match): + """Yield direct children of given paths, filtering by name and type.""" + if follow_symlinks is None: + follow_symlinks = True + for parent_path in parent_paths: try: # We must close the scandir() object before proceeding to # avoid exhausting file descriptors when globbing deep trees. - with scandir(parent_path) as scandir_it: + with parent_path._scandir() as scandir_it: entries = list(scandir_it) except OSError: pass else: for entry in entries: - if self.dironly: + if dir_only: try: - if not entry.is_dir(follow_symlinks=follow_dirlinks): + if not entry.is_dir(follow_symlinks=follow_symlinks): continue except OSError: continue name = entry.name - if self.match(name): - path = parent_path._make_child_relpath(name) - for p in self.successor._select_from(path, scandir, follow_symlinks): - yield p + if match is None or match(name): + yield parent_path._make_child_relpath(name) -class _RecursiveWildcardSelector(_Selector): - - def __init__(self, pat, child_parts, flavour, case_sensitive): - _Selector.__init__(self, child_parts, flavour, case_sensitive) - - def _iterate_directories(self, parent_path, follow_symlinks): - yield parent_path - for dirpath, dirnames, _ in parent_path.walk(follow_symlinks=follow_symlinks): - for dirname in dirnames: - yield dirpath._make_child_relpath(dirname) - - def _select_from(self, parent_path, scandir, follow_symlinks): - follow_dirlinks = False if follow_symlinks is None else follow_symlinks - successor_select = self.successor._select_from - for starting_point in self._iterate_directories(parent_path, follow_dirlinks): - for p in successor_select(starting_point, scandir, follow_symlinks): - yield p +def _select_recursive(parent_paths, dir_only, follow_symlinks): + """Yield given paths and all their subdirectories, recursively.""" + if follow_symlinks is None: + follow_symlinks = False + for parent_path in parent_paths: + paths = [parent_path] + while paths: + path = paths.pop() + yield path + try: + # We must close the scandir() object before proceeding to + # avoid exhausting file descriptors when globbing deep trees. + with path._scandir() as scandir_it: + entries = list(scandir_it) + except OSError: + pass + else: + for entry in entries: + try: + if entry.is_dir(follow_symlinks=follow_symlinks): + paths.append(path._make_child_relpath(entry.name)) + continue + except OSError: + pass + if not dir_only: + yield path._make_child_relpath(entry.name) -class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector): - """ - Like _RecursiveWildcardSelector, but also de-duplicates results from - successive selectors. This is necessary if the pattern contains - multiple non-adjacent '**' segments. - """ - - def _select_from(self, parent_path, scandir, follow_symlinks): - yielded = set() - try: - for p in super()._select_from(parent_path, scandir, follow_symlinks): - if p not in yielded: - yield p - yielded.add(p) - finally: - yielded.clear() +def _select_unique(paths): + """Yields the given paths, filtering out duplicates.""" + yielded = set() + try: + for path in paths: + raw_path = path._raw_path + if raw_path not in yielded: + yield path + yielded.add(raw_path) + finally: + yielded.clear() # @@ -1056,19 +995,26 @@ class Path(PurePath): return os.scandir(self) def _make_child_relpath(self, name): + sep = self._flavour.sep + lines_name = name.replace('\n', sep) + lines_str = self._lines path_str = str(self) tail = self._tail if tail: - path_str = f'{path_str}{self._flavour.sep}{name}' + path_str = f'{path_str}{sep}{name}' + lines_str = f'{lines_str}\n{lines_name}' elif path_str != '.': path_str = f'{path_str}{name}' + lines_str = f'{lines_str}{lines_name}' else: path_str = name + lines_str = lines_name path = self.with_segments(path_str) path._str = path_str path._drv = self.drive path._root = self.root path._tail_cached = tail + [name] + path._lines_cached = lines_str return path def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): @@ -1076,16 +1022,7 @@ class Path(PurePath): kind, including directories) matching the given relative pattern. """ sys.audit("pathlib.Path.glob", self, pattern) - if not pattern: - raise ValueError("Unacceptable pattern: {!r}".format(pattern)) - drv, root, pattern_parts = self._parse_path(pattern) - if drv or root: - raise NotImplementedError("Non-relative patterns are unsupported") - if pattern[-1] in (self._flavour.sep, self._flavour.altsep): - pattern_parts.append('') - selector = _make_selector(tuple(pattern_parts), self._flavour, case_sensitive) - for p in selector.select_from(self, follow_symlinks): - yield p + return self._glob(pattern, case_sensitive, follow_symlinks) def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): """Recursively yield all existing files (of any kind, including @@ -1093,14 +1030,74 @@ class Path(PurePath): this subtree. """ sys.audit("pathlib.Path.rglob", self, pattern) - drv, root, pattern_parts = self._parse_path(pattern) - if drv or root: + return self._glob(f'**/{pattern}', case_sensitive, follow_symlinks) + + def _glob(self, pattern, case_sensitive, follow_symlinks): + path_pattern = self.with_segments(pattern) + if path_pattern.drive or path_pattern.root: raise NotImplementedError("Non-relative patterns are unsupported") - if pattern and pattern[-1] in (self._flavour.sep, self._flavour.altsep): + elif not path_pattern._tail: + raise ValueError("Unacceptable pattern: {!r}".format(pattern)) + + pattern_parts = list(path_pattern._tail) + if pattern[-1] in (self._flavour.sep, self._flavour.altsep): + # GH-65238: pathlib doesn't preserve trailing slash. Add it back. pattern_parts.append('') - selector = _make_selector(("**",) + tuple(pattern_parts), self._flavour, case_sensitive) - for p in selector.select_from(self, follow_symlinks): - yield p + if pattern_parts[-1] == '**': + # GH-70303: '**' only matches directories. Add trailing slash. + pattern_parts.append('') + + if case_sensitive is None: + # TODO: evaluate case-sensitivity of each directory in _select_children(). + case_sensitive = _is_case_sensitive(self._flavour) + + # If symlinks are handled consistently, and the pattern does not + # contain '..' components, then we can use a 'walk-and-match' strategy + # when expanding '**' wildcards. When a '**' wildcard is encountered, + # all following pattern parts are immediately consumed and used to + # build a `re.Pattern` object. This pattern is used to filter the + # recursive walk. As a result, pattern parts following a '**' wildcard + # do not perform any filesystem access, which can be much faster! + filter_paths = follow_symlinks is not None and '..' not in pattern_parts + deduplicate_paths = False + paths = iter([self] if self.is_dir() else []) + part_idx = 0 + while part_idx < len(pattern_parts): + part = pattern_parts[part_idx] + part_idx += 1 + if part == '': + # Trailing slash. + pass + elif part == '..': + paths = (path._make_child_relpath('..') for path in paths) + elif part == '**': + # Consume adjacent '**' components. + while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': + part_idx += 1 + + if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '': + dir_only = pattern_parts[-1] == '' + paths = _select_recursive(paths, dir_only, follow_symlinks) + + # Filter out paths that don't match pattern. + prefix_len = len(self._make_child_relpath('_')._lines) - 1 + match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match + paths = (path for path in paths if match(path._lines[prefix_len:])) + return paths + + dir_only = part_idx < len(pattern_parts) + paths = _select_recursive(paths, dir_only, follow_symlinks) + if deduplicate_paths: + # De-duplicate if we've already seen a '**' component. + paths = _select_unique(paths) + deduplicate_paths = True + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + dir_only = part_idx < len(pattern_parts) + match = _compile_pattern(part, case_sensitive) + paths = _select_children(paths, dir_only, follow_symlinks, match) + return paths def walk(self, top_down=True, on_error=None, follow_symlinks=False): """Walk the directory tree from this directory, similar to os.walk().""" diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index da5197dbfac..1a008e5cea3 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -1898,6 +1898,16 @@ class _BasePathTest(object): _check(p, "*B/*", ["dirB/fileB", "dirB/linkD", "linkB/fileB", "linkB/linkD"]) _check(p, "*/fileB", ["dirB/fileB", "linkB/fileB"]) _check(p, "*/", ["dirA", "dirB", "dirC", "dirE", "linkB"]) + _check(p, "dir*/*/..", ["dirC/dirD/..", "dirA/linkC/.."]) + _check(p, "dir*/**/", ["dirA", "dirA/linkC", "dirA/linkC/linkD", "dirB", "dirB/linkD", + "dirC", "dirC/dirD", "dirE"]) + _check(p, "dir*/**/..", ["dirA/..", "dirA/linkC/..", "dirB/..", + "dirC/..", "dirC/dirD/..", "dirE/.."]) + _check(p, "dir*/*/**/", ["dirA/linkC", "dirA/linkC/linkD", "dirB/linkD", "dirC/dirD"]) + _check(p, "dir*/*/**/..", ["dirA/linkC/..", "dirC/dirD/.."]) + _check(p, "dir*/**/fileC", ["dirC/fileC"]) + _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"]) + _check(p, "*/dirD/**/", ["dirC/dirD"]) @os_helper.skip_unless_symlink def test_glob_no_follow_symlinks_common(self): @@ -1912,6 +1922,14 @@ class _BasePathTest(object): _check(p, "*B/*", ["dirB/fileB", "dirB/linkD"]) _check(p, "*/fileB", ["dirB/fileB"]) _check(p, "*/", ["dirA", "dirB", "dirC", "dirE"]) + _check(p, "dir*/*/..", ["dirC/dirD/.."]) + _check(p, "dir*/**/", ["dirA", "dirB", "dirC", "dirC/dirD", "dirE"]) + _check(p, "dir*/**/..", ["dirA/..", "dirB/..", "dirC/..", "dirC/dirD/..", "dirE/.."]) + _check(p, "dir*/*/**/", ["dirC/dirD"]) + _check(p, "dir*/*/**/..", ["dirC/dirD/.."]) + _check(p, "dir*/**/fileC", ["dirC/fileC"]) + _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"]) + _check(p, "*/dirD/**/", ["dirC/dirD"]) def test_rglob_common(self): def _check(glob, expected): diff --git a/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst b/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst new file mode 100644 index 00000000000..841a9e1e26d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst @@ -0,0 +1,4 @@ +Improve performance of :meth:`pathlib.Path.glob` when expanding a pattern with +a non-terminal "``**``" component by filtering walked paths through a regular +expression, rather than calling :func:`os.scandir` more than once on each +directory.