From cf67ebfb315ce36175f3d425249d7c6560f6d0d5 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Mon, 13 Nov 2023 17:15:56 +0000 Subject: [PATCH] GH-72904: Add `glob.translate()` function (#106703) Add `glob.translate()` function that converts a pathname with shell wildcards to a regular expression. The regular expression is used by pathlib to implement `match()` and `glob()`. This function differs from `fnmatch.translate()` in that wildcards do not match path separators by default, and that a `*` pattern segment matches precisely one path segment. When *recursive* is set to true, `**` pattern segments match any number of path segments, and `**` cannot appear outside its own segment. In pathlib, this change speeds up directory walking (because `_make_child_relpath()` does less work), makes path objects smaller (they don't need a `_lines` slot), and removes the need for some gnarly code. Co-authored-by: Jason R. Coombs Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Doc/library/glob.rst | 39 ++++++ Doc/whatsnew/3.13.rst | 7 + Lib/fnmatch.py | 11 +- Lib/glob.py | 60 +++++++++ Lib/pathlib.py | 125 +++--------------- Lib/test/test_glob.py | 91 +++++++++++++ ...3-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst | 2 + 7 files changed, 229 insertions(+), 106 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index 0e4cfe7ebed..8e76d2d5f16 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -145,6 +145,45 @@ default. For example, consider a directory containing :file:`card.gif` and >>> glob.glob('.c*') ['.card.gif'] + +.. function:: translate(pathname, *, recursive=False, include_hidden=False, seps=None) + + Convert the given path specification to a regular expression for use with + :func:`re.match`. The path specification can contain shell-style wildcards. 
+ + For example: + + >>> import glob, re + >>> + >>> regex = glob.translate('**/*.txt', recursive=True, include_hidden=True) + >>> regex + '(?s:(?:.+/)?[^/]*\\.txt)\\Z' + >>> reobj = re.compile(regex) + >>> reobj.match('foo/bar/baz.txt') + <re.Match object; span=(0, 15), match='foo/bar/baz.txt'> + + Path separators and segments are meaningful to this function, unlike + :func:`fnmatch.translate`. By default wildcards do not match path + separators, and ``*`` pattern segments match precisely one path segment. + + If *recursive* is true, the pattern segment "``**``" will match any number + of path segments. If "``**``" occurs in any position other than a full + pattern segment, :exc:`ValueError` is raised. + + If *include_hidden* is true, wildcards can match path segments that start + with a dot (``.``). + + A sequence of path separators may be supplied to the *seps* argument. If + not given, :data:`os.sep` and :data:`~os.altsep` (if available) are used. + + .. seealso:: + + :meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob` methods, + which call this function to implement pattern matching and globbing. + + .. versionadded:: 3.13 + + .. seealso:: Module :mod:`fnmatch` diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 9f9239a7eeb..81e133bb545 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -183,6 +183,13 @@ doctest :attr:`doctest.TestResults.skipped` attributes. (Contributed by Victor Stinner in :gh:`108794`.) +glob +---- + +* Add :func:`glob.translate` function that converts a path specification with + shell-style wildcards to a regular expression. + (Contributed by Barney Gale in :gh:`72904`.) 
+ io -- diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index d5e296f7748..73acb1fe8d4 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -78,6 +78,11 @@ def translate(pat): """ STAR = object() + parts = _translate(pat, STAR, '.') + return _join_translated_parts(parts, STAR) + + +def _translate(pat, STAR, QUESTION_MARK): res = [] add = res.append i, n = 0, len(pat) @@ -89,7 +94,7 @@ def translate(pat): if (not res) or res[-1] is not STAR: add(STAR) elif c == '?': - add('.') + add(QUESTION_MARK) elif c == '[': j = i if j < n and pat[j] == '!': @@ -146,9 +151,11 @@ def translate(pat): else: add(re.escape(c)) assert i == n + return res + +def _join_translated_parts(inp, STAR): # Deal with STARs. - inp = res res = [] add = res.append i, n = 0, len(inp) diff --git a/Lib/glob.py b/Lib/glob.py index a7256422d52..4a335a10766 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -249,3 +249,63 @@ def escape(pathname): _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) + + +def translate(pat, *, recursive=False, include_hidden=False, seps=None): + """Translate a pathname with shell wildcards to a regular expression. + + If `recursive` is true, the pattern segment '**' will match any number of + path segments; if '**' appears outside its own segment, ValueError will be + raised. + + If `include_hidden` is true, wildcards can match path segments beginning + with a dot ('.'). + + If a sequence of separator characters is given to `seps`, they will be + used to split the pattern into segments and match path separators. If not + given, os.path.sep and os.path.altsep (where available) are used. 
+ """ + if not seps: + if os.path.altsep: + seps = (os.path.sep, os.path.altsep) + else: + seps = os.path.sep + escaped_seps = ''.join(map(re.escape, seps)) + any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps + not_sep = f'[^{escaped_seps}]' + if include_hidden: + one_last_segment = f'{not_sep}+' + one_segment = f'{one_last_segment}{any_sep}' + any_segments = f'(?:.+{any_sep})?' + any_last_segments = '.*' + else: + one_last_segment = f'[^{escaped_seps}.]{not_sep}*' + one_segment = f'{one_last_segment}{any_sep}' + any_segments = f'(?:{one_segment})*' + any_last_segments = f'{any_segments}(?:{one_last_segment})?' + + results = [] + parts = re.split(any_sep, pat) + last_part_idx = len(parts) - 1 + for idx, part in enumerate(parts): + if part == '*': + results.append(one_segment if idx < last_part_idx else one_last_segment) + continue + if recursive: + if part == '**': + if idx < last_part_idx: + if parts[idx + 1] != '**': + results.append(any_segments) + else: + results.append(any_last_segments) + continue + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + if part: + if not include_hidden and part[0] in '*?': + results.append(r'(?!\.)') + results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)) + if idx < last_part_idx: + results.append(any_sep) + res = ''.join(results) + return fr'(?s:{res})\Z' diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 47a043c5e6b..c06ea5c9bf1 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -6,8 +6,8 @@ operating systems. """ import contextlib -import fnmatch import functools +import glob import io import ntpath import os @@ -76,78 +76,16 @@ def _is_case_sensitive(pathmod): # -# fnmatch.translate() returns a regular expression that includes a prefix and -# a suffix, which enable matching newlines and ensure the end of the string is -# matched, respectively. 
These features are undesirable for our implementation -# of PurePatch.match(), which represents path separators as newlines and joins -# pattern segments together. As a workaround, we define a slice object that -# can remove the prefix and suffix from any translate() result. See the -# _compile_pattern_lines() function for more details. -_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') -_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) -_SWAP_SEP_AND_NEWLINE = { - '/': str.maketrans({'/': '\n', '\n': '/'}), - '\\': str.maketrans({'\\': '\n', '\n': '\\'}), -} - - @functools.lru_cache(maxsize=256) -def _compile_pattern(pat, case_sensitive): +def _compile_pattern(pat, sep, case_sensitive): """Compile given glob pattern to a re.Pattern object (observing case - sensitivity), or None if the pattern should match everything.""" - if pat == '*': - return None + sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE - return re.compile(fnmatch.translate(pat), flags).match - - -@functools.lru_cache() -def _compile_pattern_lines(pattern_lines, case_sensitive): - """Compile the given pattern lines to an `re.Pattern` object. - - The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with - its path separators and newlines swapped (e.g. '**\n*.py`). By using - newlines to separate path components, and not setting `re.DOTALL`, we - ensure that the `*` wildcard cannot match path separators. - - The returned `re.Pattern` object may have its `match()` method called to - match a complete pattern, or `search()` to match from the right. The - argument supplied to these methods must also have its path separators and - newlines swapped. 
- """ - - # Match the start of the path, or just after a path separator - parts = ['^'] - for part in pattern_lines.splitlines(keepends=True): - if part == '*\n': - part = r'.+\n' - elif part == '*': - part = r'.+' - elif part == '**\n': - # '**/' component: we use '(?s:.)' rather than '.' so that path - # separators (i.e. newlines) are matched. The trailing '^' ensures - # we terminate after a path separator (i.e. on a new line). - part = r'(?s:.)*^' - elif part == '**': - # '**' component. - part = r'(?s:.)*' - elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: - # Any other component: pass to fnmatch.translate(). We slice off - # the common prefix and suffix added by translate() to ensure that - # re.DOTALL is not set, and the end of the string not matched, - # respectively. With DOTALL not set, '*' wildcards will not match - # path separators, because the '.' characters in the pattern will - # not match newlines. - part = fnmatch.translate(part)[_FNMATCH_SLICE] - parts.append(part) - # Match the end of the path, always. - parts.append(r'\Z') - flags = re.MULTILINE - if not case_sensitive: - flags |= re.IGNORECASE - return re.compile(''.join(parts), flags=flags) + regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep) + # The string representation of an empty path is a single dot ('.'). Empty + # paths shouldn't match wildcards, so we consume it with an atomic group. + regex = r'(\.\Z)?+' + regex + return re.compile(regex, flags).match def _select_children(parent_paths, dir_only, follow_symlinks, match): @@ -171,7 +109,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match): except OSError: continue name = entry.name - if match is None or match(name): + if match(name): yield parent_path._make_child_relpath(name) @@ -297,10 +235,6 @@ class PurePath: # to implement comparison methods like `__lt__()`. 
'_parts_normcase_cached', - # The `_lines_cached` slot stores the string path with path separators - # and newlines swapped. This is used to implement `match()`. - '_lines_cached', - # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. '_hash', @@ -475,20 +409,6 @@ class PurePath: self._parts_normcase_cached = self._str_normcase.split(self.pathmod.sep) return self._parts_normcase_cached - @property - def _lines(self): - # Path with separators and newlines swapped, for pattern matching. - try: - return self._lines_cached - except AttributeError: - path_str = str(self) - if path_str == '.': - self._lines_cached = '' - else: - trans = _SWAP_SEP_AND_NEWLINE[self.pathmod.sep] - self._lines_cached = path_str.translate(trans) - return self._lines_cached - def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented @@ -763,13 +683,16 @@ class PurePath: path_pattern = self.with_segments(path_pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self.pathmod) - pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive) + sep = path_pattern.pathmod.sep + pattern_str = str(path_pattern) if path_pattern.drive or path_pattern.root: - return pattern.match(self._lines) is not None + pass elif path_pattern._tail: - return pattern.search(self._lines) is not None + pattern_str = f'**{sep}{pattern_str}' else: raise ValueError("empty pattern") + match = _compile_pattern(pattern_str, sep, case_sensitive) + return match(str(self)) is not None # Subclassing os.PathLike makes isinstance() checks slower, @@ -1069,26 +992,19 @@ class _PathBase(PurePath): return contextlib.nullcontext(self.iterdir()) def _make_child_relpath(self, name): - sep = self.pathmod.sep - lines_name = name.replace('\n', sep) - lines_str = self._lines path_str = str(self) tail = self._tail if tail: - path_str = f'{path_str}{sep}{name}' - lines_str = f'{lines_str}\n{lines_name}' + path_str = 
f'{path_str}{self.pathmod.sep}{name}' elif path_str != '.': path_str = f'{path_str}{name}' - lines_str = f'{lines_str}{lines_name}' else: path_str = name - lines_str = lines_name path = self.with_segments(path_str) path._str = path_str path._drv = self.drive path._root = self.root path._tail_cached = tail + [name] - path._lines_cached = lines_str return path def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): @@ -1139,6 +1055,7 @@ class _PathBase(PurePath): # do not perform any filesystem access, which can be much faster! filter_paths = follow_symlinks is not None and '..' not in pattern_parts deduplicate_paths = False + sep = self.pathmod.sep paths = iter([self] if self.is_dir() else []) part_idx = 0 while part_idx < len(pattern_parts): @@ -1159,9 +1076,9 @@ class _PathBase(PurePath): paths = _select_recursive(paths, dir_only, follow_symlinks) # Filter out paths that don't match pattern. - prefix_len = len(self._make_child_relpath('_')._lines) - 1 - match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match - paths = (path for path in paths if match(path._lines[prefix_len:])) + prefix_len = len(str(self._make_child_relpath('_'))) - 1 + match = _compile_pattern(str(path_pattern), sep, case_sensitive) + paths = (path for path in paths if match(str(path), prefix_len)) return paths dir_only = part_idx < len(pattern_parts) @@ -1174,7 +1091,7 @@ class _PathBase(PurePath): raise ValueError("Invalid pattern: '**' can only be an entire path component") else: dir_only = part_idx < len(pattern_parts) - match = _compile_pattern(part, case_sensitive) + match = _compile_pattern(part, sep, case_sensitive) paths = _select_children(paths, dir_only, follow_symlinks, match) return paths diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index f4b5821f408..aa5fac8eca1 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -1,5 +1,6 @@ import glob import os +import re import shutil import sys import unittest @@ -349,6 +350,96 @@ 
class GlobTests(unittest.TestCase): for it in iters: self.assertEqual(next(it), p) + def test_translate_matching(self): + match = re.compile(glob.translate('*')).match + self.assertIsNotNone(match('foo')) + self.assertIsNotNone(match('foo.bar')) + self.assertIsNone(match('.foo')) + match = re.compile(glob.translate('.*')).match + self.assertIsNotNone(match('.foo')) + match = re.compile(glob.translate('**', recursive=True)).match + self.assertIsNotNone(match('foo')) + self.assertIsNone(match('.foo')) + self.assertIsNotNone(match(os.path.join('foo', 'bar'))) + self.assertIsNone(match(os.path.join('foo', '.bar'))) + self.assertIsNone(match(os.path.join('.foo', 'bar'))) + self.assertIsNone(match(os.path.join('.foo', '.bar'))) + match = re.compile(glob.translate('**/*', recursive=True)).match + self.assertIsNotNone(match(os.path.join('foo', 'bar'))) + self.assertIsNone(match(os.path.join('foo', '.bar'))) + self.assertIsNone(match(os.path.join('.foo', 'bar'))) + self.assertIsNone(match(os.path.join('.foo', '.bar'))) + match = re.compile(glob.translate('*/**', recursive=True)).match + self.assertIsNotNone(match(os.path.join('foo', 'bar'))) + self.assertIsNone(match(os.path.join('foo', '.bar'))) + self.assertIsNone(match(os.path.join('.foo', 'bar'))) + self.assertIsNone(match(os.path.join('.foo', '.bar'))) + match = re.compile(glob.translate('**/.bar', recursive=True)).match + self.assertIsNotNone(match(os.path.join('foo', '.bar'))) + self.assertIsNone(match(os.path.join('.foo', '.bar'))) + match = re.compile(glob.translate('**/*.*', recursive=True)).match + self.assertIsNone(match(os.path.join('foo', 'bar'))) + self.assertIsNone(match(os.path.join('foo', '.bar'))) + self.assertIsNotNone(match(os.path.join('foo', 'bar.txt'))) + self.assertIsNone(match(os.path.join('foo', '.bar.txt'))) + + def test_translate(self): + def fn(pat): + return glob.translate(pat, seps='/') + self.assertEqual(fn('foo'), r'(?s:foo)\Z') + self.assertEqual(fn('foo/bar'), r'(?s:foo/bar)\Z') + 
self.assertEqual(fn('*'), r'(?s:[^/.][^/]*)\Z') + self.assertEqual(fn('?'), r'(?s:(?!\.)[^/])\Z') + self.assertEqual(fn('a*'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('*a'), r'(?s:(?!\.)[^/]*a)\Z') + self.assertEqual(fn('.*'), r'(?s:\.[^/]*)\Z') + self.assertEqual(fn('?aa'), r'(?s:(?!\.)[^/]aa)\Z') + self.assertEqual(fn('aa?'), r'(?s:aa[^/])\Z') + self.assertEqual(fn('aa[ab]'), r'(?s:aa[ab])\Z') + self.assertEqual(fn('**'), r'(?s:(?!\.)[^/]*)\Z') + self.assertEqual(fn('***'), r'(?s:(?!\.)[^/]*)\Z') + self.assertEqual(fn('a**'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('**b'), r'(?s:(?!\.)[^/]*b)\Z') + self.assertEqual(fn('/**/*/*.*/**'), + r'(?s:/(?!\.)[^/]*/[^/.][^/]*/(?!\.)[^/]*\.[^/]*/(?!\.)[^/]*)\Z') + + def test_translate_include_hidden(self): + def fn(pat): + return glob.translate(pat, include_hidden=True, seps='/') + self.assertEqual(fn('foo'), r'(?s:foo)\Z') + self.assertEqual(fn('foo/bar'), r'(?s:foo/bar)\Z') + self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('?'), r'(?s:[^/])\Z') + self.assertEqual(fn('a*'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('*a'), r'(?s:[^/]*a)\Z') + self.assertEqual(fn('.*'), r'(?s:\.[^/]*)\Z') + self.assertEqual(fn('?aa'), r'(?s:[^/]aa)\Z') + self.assertEqual(fn('aa?'), r'(?s:aa[^/])\Z') + self.assertEqual(fn('aa[ab]'), r'(?s:aa[ab])\Z') + self.assertEqual(fn('**'), r'(?s:[^/]*)\Z') + self.assertEqual(fn('***'), r'(?s:[^/]*)\Z') + self.assertEqual(fn('a**'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('**b'), r'(?s:[^/]*b)\Z') + self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/[^/]*/[^/]+/[^/]*\.[^/]*/[^/]*)\Z') + + def test_translate_recursive(self): + def fn(pat): + return glob.translate(pat, recursive=True, include_hidden=True, seps='/') + self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('?'), r'(?s:[^/])\Z') + self.assertEqual(fn('**'), r'(?s:.*)\Z') + self.assertEqual(fn('**/**'), r'(?s:.*)\Z') + self.assertRaises(ValueError, fn, '***') + self.assertRaises(ValueError, fn, 'a**') + 
self.assertRaises(ValueError, fn, '**b') + self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/(?:.+/)?[^/]+/[^/]*\.[^/]*/.*)\Z') + + def test_translate_seps(self): + def fn(pat): + return glob.translate(pat, recursive=True, include_hidden=True, seps=['/', '\\']) + self.assertEqual(fn('foo/bar\\baz'), r'(?s:foo[/\\]bar[/\\]baz)\Z') + self.assertEqual(fn('**/*'), r'(?s:(?:.+[/\\])?[^/\\]+)\Z') + @skip_unless_symlink class SymlinkLoopGlobTests(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst new file mode 100644 index 00000000000..edc8ab07bb0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst @@ -0,0 +1,2 @@ +Add :func:`glob.translate`. This function converts a pathname with shell-style +wildcards to a regular expression.