GH-114847: Speed up `posixpath.realpath()` (#114848)

Apply the following optimizations to `posixpath.realpath()`:

- Remove use of recursion
- Construct child paths directly rather than using `join()`
- Use `os.getcwd[b]()` rather than `abspath()`
- Use `startswith(sep)` rather than `isabs()`
- Use slicing rather than `posixpath.split()`

Co-authored-by: Petr Viktorin <encukou@gmail.com>
This commit is contained in:
Barney Gale 2024-04-05 13:35:01 +01:00 committed by GitHub
parent 9ceaee74db
commit abfa16b44b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 64 additions and 34 deletions

View File

@ -403,55 +403,66 @@ def realpath(filename, *, strict=False):
"""Return the canonical path of the specified filename, eliminating any """Return the canonical path of the specified filename, eliminating any
symbolic links encountered in the path.""" symbolic links encountered in the path."""
filename = os.fspath(filename) filename = os.fspath(filename)
path, ok = _joinrealpath(filename[:0], filename, strict, {}) if isinstance(filename, bytes):
return abspath(path)
# Join two paths, normalizing and eliminating any symbolic links
# encountered in the second path.
# Two leading slashes are replaced by a single slash.
def _joinrealpath(path, rest, strict, seen):
if isinstance(path, bytes):
sep = b'/' sep = b'/'
curdir = b'.' curdir = b'.'
pardir = b'..' pardir = b'..'
getcwd = os.getcwdb
else: else:
sep = '/' sep = '/'
curdir = '.' curdir = '.'
pardir = '..' pardir = '..'
getcwd = os.getcwd
if rest.startswith(sep): # The stack of unresolved path parts. When popped, a special value of None
rest = rest[1:] # indicates that a symlink target has been resolved, and that the original
path = sep # symlink path can be retrieved by popping again. The [::-1] slice is a
# very fast way of spelling list(reversed(...)).
rest = filename.split(sep)[::-1]
# The resolved path, which is absolute throughout this function.
# Note: getcwd() returns a normalized and symlink-free path.
path = sep if filename.startswith(sep) else getcwd()
# Mapping from symlink paths to *fully resolved* symlink targets. If a
# symlink is encountered but not yet resolved, the value is None. This is
# used both to detect symlink loops and to speed up repeated traversals of
# the same links.
seen = {}
# Whether we're calling lstat() and readlink() to resolve symlinks. If we
# encounter an OSError for a symlink loop in non-strict mode, this is
# switched off.
querying = True
while rest: while rest:
name, _, rest = rest.partition(sep) name = rest.pop()
if name is None:
# resolved symlink target
seen[rest.pop()] = path
continue
if not name or name == curdir: if not name or name == curdir:
# current dir # current dir
continue continue
if name == pardir: if name == pardir:
# parent dir # parent dir
if path: path = path[:path.rindex(sep)] or sep
parent, name = split(path) continue
if name == pardir: if path == sep:
# ../.. newpath = path + name
path = join(path, pardir) else:
else: newpath = path + sep + name
# foo/bar/.. -> foo if not querying:
path = parent path = newpath
else:
# ..
path = pardir
continue continue
newpath = join(path, name)
try: try:
st = os.lstat(newpath) st = os.lstat(newpath)
if not stat.S_ISLNK(st.st_mode):
path = newpath
continue
except OSError: except OSError:
if strict: if strict:
raise raise
is_link = False
else:
is_link = stat.S_ISLNK(st.st_mode)
if not is_link:
path = newpath path = newpath
continue continue
# Resolve the symbolic link # Resolve the symbolic link
@ -467,14 +478,23 @@ def _joinrealpath(path, rest, strict, seen):
os.stat(newpath) os.stat(newpath)
else: else:
# Return already resolved part + rest of the path unchanged. # Return already resolved part + rest of the path unchanged.
return join(newpath, rest), False path = newpath
querying = False
continue
seen[newpath] = None # not resolved symlink seen[newpath] = None # not resolved symlink
path, ok = _joinrealpath(path, os.readlink(newpath), strict, seen) target = os.readlink(newpath)
if not ok: if target.startswith(sep):
return join(path, rest), False # Symlink target is absolute; reset resolved path.
seen[newpath] = path # resolved symlink path = sep
# Push the symlink path onto the stack, and signal its specialness by
# also pushing None. When these entries are popped, we'll record the
# fully-resolved symlink target in the 'seen' mapping.
rest.append(newpath)
rest.append(None)
# Push the unresolved symlink target parts onto the stack.
rest.extend(target.split(sep)[::-1])
return path, True return path
supports_unicode_filenames = (sys.platform == 'darwin') supports_unicode_filenames = (sys.platform == 'darwin')

View File

@ -456,6 +456,15 @@ class PosixPathTest(unittest.TestCase):
finally: finally:
os_helper.unlink(ABSTFN) os_helper.unlink(ABSTFN)
@os_helper.skip_unless_symlink
@skip_if_ABSTFN_contains_backslash
def test_realpath_missing_pardir(self):
    # A symlink reached through "nonexistent/.." must still be resolved:
    # the ".." cancels the missing component, and the link is followed to
    # its target.
    # NOTE(review): presumably ABSTFN is the absolute form of
    # os_helper.TESTFN -- confirm against the module-level definition.
    link = os_helper.TESTFN
    try:
        os.symlink(link + "1", link)
        resolved = realpath("nonexistent/../" + link)
        self.assertEqual(resolved, ABSTFN + "1")
    finally:
        # Always remove the link, even when the assertion above fails.
        os_helper.unlink(link)
@os_helper.skip_unless_symlink @os_helper.skip_unless_symlink
@skip_if_ABSTFN_contains_backslash @skip_if_ABSTFN_contains_backslash
def test_realpath_symlink_loops(self): def test_realpath_symlink_loops(self):

View File

@ -0,0 +1 @@
Speed up :func:`os.path.realpath` on non-Windows platforms.