GH-101362: Omit path anchor from `pathlib.PurePath()._parts` (GH-102476)

Improve performance of path construction by skipping the addition of the path anchor (`drive + root`) to the internal `_parts` list. Rename this attribute to `_tail` for clarity.
This commit is contained in:
Barney Gale 2023-04-09 18:40:03 +01:00 committed by GitHub
parent 0a675f4bb5
commit 2c673d5e93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 108 additions and 67 deletions

View File

@ -210,20 +210,17 @@ class _RecursiveWildcardSelector(_Selector):
class _PathParents(Sequence):
"""This object provides sequence-like access to the logical ancestors
of a path. Don't try to construct it yourself."""
__slots__ = ('_pathcls', '_drv', '_root', '_parts')
__slots__ = ('_pathcls', '_drv', '_root', '_tail')
def __init__(self, path):
# We don't store the instance to avoid reference cycles
self._pathcls = type(path)
self._drv = path.drive
self._root = path.root
self._parts = path._parts
self._tail = path._tail
def __len__(self):
if self._drv or self._root:
return len(self._parts) - 1
else:
return len(self._parts)
return len(self._tail)
def __getitem__(self, idx):
if isinstance(idx, slice):
@ -234,7 +231,7 @@ class _PathParents(Sequence):
if idx < 0:
idx += len(self)
return self._pathcls._from_parsed_parts(self._drv, self._root,
self._parts[:-idx - 1])
self._tail[:-idx - 1])
def __repr__(self):
return "<{}.parents>".format(self._pathcls.__name__)
@ -249,9 +246,41 @@ class PurePath(object):
PureWindowsPath object. You can also instantiate either of these classes
directly, regardless of your system.
"""
__slots__ = (
'_raw_path', '_drv', '_root', '_parts_cached',
'_str', '_hash', '_parts_tuple', '_parts_normcase_cached',
# The `_raw_path` slot stores an unnormalized string path. This is set
# in the `__init__()` method.
'_raw_path',
# The `_drv`, `_root` and `_tail_cached` slots store parsed and
# normalized parts of the path. They are set when any of the `drive`,
# `root` or `_tail` properties are accessed for the first time. The
# three-part division corresponds to the result of
# `os.path.splitroot()`, except that the tail is further split on path
# separators (i.e. it is a list of strings), and that the root and
# tail are normalized.
'_drv', '_root', '_tail_cached',
# The `_str` slot stores the string representation of the path,
# computed from the drive, root and tail when `__str__()` is called
# for the first time. It's used to implement `_str_normcase`.
'_str',
# The `_str_normcase_cached` slot stores the string path with
# normalized case. It is set when the `_str_normcase` property is
# accessed for the first time. It's used to implement `__eq__()`,
# `__hash__()`, and `_parts_normcase`.
'_str_normcase_cached',
# The `_parts_normcase_cached` slot stores the case-normalized
# string path after splitting on path separators. It's set when the
# `_parts_normcase` property is accessed for the first time. It's used
# to implement comparison methods like `__lt__()`.
'_parts_normcase_cached',
# The `_hash` slot stores the hash of the case-normalized string
# path. It's set when `__hash__()` is called for the first time.
'_hash',
)
_flavour = os.path
@ -277,10 +306,7 @@ class PurePath(object):
path = os.fspath(args[0])
else:
path = self._flavour.join(*args)
if isinstance(path, str):
# Force-cast str subclasses to str (issue #21127)
path = str(path)
else:
if not isinstance(path, str):
raise TypeError(
"argument should be a str or an os.PathLike "
"object where __fspath__ returns a str, "
@ -299,33 +325,32 @@ class PurePath(object):
if drv.startswith(sep):
# pathlib assumes that UNC paths always have a root.
root = sep
unfiltered_parsed = [drv + root] + rel.split(sep)
parsed = [sys.intern(x) for x in unfiltered_parsed if x and x != '.']
parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
return drv, root, parsed
def _load_parts(self):
drv, root, parts = self._parse_path(self._raw_path)
drv, root, tail = self._parse_path(self._raw_path)
self._drv = drv
self._root = root
self._parts_cached = parts
self._tail_cached = tail
@classmethod
def _from_parsed_parts(cls, drv, root, parts):
path = cls._format_parsed_parts(drv, root, parts)
def _from_parsed_parts(cls, drv, root, tail):
path = cls._format_parsed_parts(drv, root, tail)
self = cls(path)
self._str = path or '.'
self._drv = drv
self._root = root
self._parts_cached = parts
self._tail_cached = tail
return self
@classmethod
def _format_parsed_parts(cls, drv, root, parts):
def _format_parsed_parts(cls, drv, root, tail):
if drv or root:
return drv + root + cls._flavour.sep.join(parts[1:])
elif parts and cls._flavour.splitdrive(parts[0])[0]:
parts = ['.'] + parts
return cls._flavour.sep.join(parts)
return drv + root + cls._flavour.sep.join(tail)
elif tail and cls._flavour.splitdrive(tail[0])[0]:
tail = ['.'] + tail
return cls._flavour.sep.join(tail)
def __str__(self):
"""Return the string representation of the path, suitable for
@ -334,7 +359,7 @@ class PurePath(object):
return self._str
except AttributeError:
self._str = self._format_parsed_parts(self.drive, self.root,
self._parts) or '.'
self._tail) or '.'
return self._str
def __fspath__(self):
@ -374,25 +399,34 @@ class PurePath(object):
path = str(self)
return prefix + urlquote_from_bytes(os.fsencode(path))
@property
def _str_normcase(self):
# String with normalized case, for hashing and equality checks
try:
return self._str_normcase_cached
except AttributeError:
self._str_normcase_cached = self._flavour.normcase(str(self))
return self._str_normcase_cached
@property
def _parts_normcase(self):
# Cached parts with normalized case, for hashing and comparison.
# Cached parts with normalized case, for comparisons.
try:
return self._parts_normcase_cached
except AttributeError:
self._parts_normcase_cached = [self._flavour.normcase(p) for p in self._parts]
self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
return self._parts_normcase_cached
def __eq__(self, other):
if not isinstance(other, PurePath):
return NotImplemented
return self._parts_normcase == other._parts_normcase and self._flavour is other._flavour
return self._str_normcase == other._str_normcase and self._flavour is other._flavour
def __hash__(self):
try:
return self._hash
except AttributeError:
self._hash = hash(tuple(self._parts_normcase))
self._hash = hash(self._str_normcase)
return self._hash
def __lt__(self, other):
@ -434,12 +468,12 @@ class PurePath(object):
return self._root
@property
def _parts(self):
def _tail(self):
try:
return self._parts_cached
return self._tail_cached
except AttributeError:
self._load_parts()
return self._parts_cached
return self._tail_cached
@property
def anchor(self):
@ -450,10 +484,10 @@ class PurePath(object):
@property
def name(self):
"""The final path component, if any."""
parts = self._parts
if len(parts) == (1 if (self.drive or self.root) else 0):
tail = self._tail
if not tail:
return ''
return parts[-1]
return tail[-1]
@property
def suffix(self):
@ -501,7 +535,7 @@ class PurePath(object):
if drv or root or not tail or f.sep in tail or (f.altsep and f.altsep in tail):
raise ValueError("Invalid name %r" % (name))
return self._from_parsed_parts(self.drive, self.root,
self._parts[:-1] + [name])
self._tail[:-1] + [name])
def with_stem(self, stem):
"""Return a new path with the stem changed."""
@ -526,7 +560,7 @@ class PurePath(object):
else:
name = name[:-len(old_suffix)] + suffix
return self._from_parsed_parts(self.drive, self.root,
self._parts[:-1] + [name])
self._tail[:-1] + [name])
def relative_to(self, other, /, *_deprecated, walk_up=False):
"""Return the relative path to another path identified by the passed
@ -551,7 +585,7 @@ class PurePath(object):
raise ValueError(f"{str(self)!r} and {str(other)!r} have different anchors")
if step and not walk_up:
raise ValueError(f"{str(self)!r} is not in the subpath of {str(other)!r}")
parts = ('..',) * step + self.parts[len(path.parts):]
parts = ['..'] * step + self._tail[len(path._tail):]
return path_cls(*parts)
def is_relative_to(self, other, /, *_deprecated):
@ -570,13 +604,10 @@ class PurePath(object):
def parts(self):
"""An object providing sequence-like access to the
components in the filesystem path."""
# We cache the tuple to avoid building a new one each time .parts
# is accessed. XXX is this necessary?
try:
return self._parts_tuple
except AttributeError:
self._parts_tuple = tuple(self._parts)
return self._parts_tuple
if self.drive or self.root:
return (self.drive + self.root,) + tuple(self._tail)
else:
return tuple(self._tail)
def joinpath(self, *args):
"""Combine this path with one or several arguments, and return a
@ -603,10 +634,10 @@ class PurePath(object):
"""The logical parent of the path."""
drv = self.drive
root = self.root
parts = self._parts
if len(parts) == 1 and (drv or root):
tail = self._tail
if not tail:
return self
return self._from_parsed_parts(drv, root, parts[:-1])
return self._from_parsed_parts(drv, root, tail[:-1])
@property
def parents(self):
@ -624,29 +655,29 @@ class PurePath(object):
def is_reserved(self):
"""Return True if the path contains one of the special names reserved
by the system, if any."""
if self._flavour is posixpath or not self._parts:
if self._flavour is posixpath or not self._tail:
return False
# NOTE: the rules for reserved names seem somewhat complicated
# (e.g. r"..\NUL" is reserved but not r"foo\NUL" if "foo" does not
# exist). We err on the side of caution and may return True for paths
# which are not actually considered reserved by Windows.
if self._parts[0].startswith('\\\\'):
if self.drive.startswith('\\\\'):
# UNC paths are never reserved.
return False
name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ')
name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ')
return name.upper() in _WIN_RESERVED_NAMES
def match(self, path_pattern):
"""
Return True if this path matches the given pattern.
"""
path_pattern = self._flavour.normcase(path_pattern)
drv, root, pat_parts = self._parse_path(path_pattern)
if not pat_parts:
pat = type(self)(path_pattern)
if not pat.parts:
raise ValueError("empty pattern")
pat_parts = pat._parts_normcase
parts = self._parts_normcase
if drv or root:
if pat.drive or pat.root:
if len(pat_parts) != len(parts):
return False
elif len(pat_parts) > len(parts):
@ -707,11 +738,21 @@ class Path(PurePath):
cls = WindowsPath if os.name == 'nt' else PosixPath
return object.__new__(cls)
def _make_child_relpath(self, part):
# This is an optimization used for dir walking. `part` must be
# a single part relative to this path.
parts = self._parts + [part]
return self._from_parsed_parts(self.drive, self.root, parts)
def _make_child_relpath(self, name):
path_str = str(self)
tail = self._tail
if tail:
path_str = f'{path_str}{self._flavour.sep}{name}'
elif path_str != '.':
path_str = f'{path_str}{name}'
else:
path_str = name
path = type(self)(path_str)
path._str = path_str
path._drv = self.drive
path._root = self.root
path._tail_cached = tail + [name]
return path
def __enter__(self):
# In previous versions of pathlib, __exit__() marked this path as
@ -1196,12 +1237,12 @@ class Path(PurePath):
(as returned by os.path.expanduser)
"""
if (not (self.drive or self.root) and
self._parts and self._parts[0][:1] == '~'):
homedir = self._flavour.expanduser(self._parts[0])
self._tail and self._tail[0][:1] == '~'):
homedir = self._flavour.expanduser(self._tail[0])
if homedir[:1] == "~":
raise RuntimeError("Could not determine home directory.")
drv, root, parts = self._parse_path(homedir)
return self._from_parsed_parts(drv, root, parts + self._parts[1:])
drv, root, tail = self._parse_path(homedir)
return self._from_parsed_parts(drv, root, tail + self._tail[1:])
return self

View File

@ -346,8 +346,6 @@ class _BasePurePathTest(object):
p = P('a/b')
parts = p.parts
self.assertEqual(parts, ('a', 'b'))
# The object gets reused.
self.assertIs(parts, p.parts)
# When the path is absolute, the anchor is a separate part.
p = P('/a/b')
parts = p.parts

View File

@ -0,0 +1,2 @@
Speed up :class:`pathlib.Path` construction by omitting the path anchor from
the internal list of path parts.