From 7372b06cd7b6b1e88482215ffa27eec8e4b7861a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Charles-Fran=C3=A7ois=20Natali?= Date: Sun, 5 Feb 2012 15:15:38 +0100 Subject: [PATCH] Issue #13734: Add os.fwalk(), a directory walking function yielding file descriptors. --- Doc/library/os.rst | 51 ++++++++++++++++++++++ Doc/whatsnew/3.3.rst | 4 ++ Lib/os.py | 101 ++++++++++++++++++++++++++++++++++++++++--- Lib/test/test_os.py | 60 ++++++++++++++++++++++++- Misc/NEWS | 3 ++ 5 files changed, 212 insertions(+), 7 deletions(-) diff --git a/Doc/library/os.rst b/Doc/library/os.rst index 617f2763603..8601687227f 100644 --- a/Doc/library/os.rst +++ b/Doc/library/os.rst @@ -2251,6 +2251,57 @@ Files and Directories os.rmdir(os.path.join(root, name)) +.. function:: fwalk(top, topdown=True, onerror=None, followlinks=False) + + .. index:: + single: directory; walking + single: directory; traversal + + This behaves exactly like :func:`walk`, except that it yields a 4-tuple + ``(dirpath, dirnames, filenames, dirfd)``. + + *dirpath*, *dirnames* and *filenames* are identical to :func:`walk` output, + and *dirfd* is a file descriptor referring to the directory *dirpath*. + + .. note:: + + Since :func:`fwalk` yields file descriptors, those are only valid until + the next iteration step, so you should duplicate them (e.g. with + :func:`dup`) if you want to keep them longer. + + This example displays the number of bytes taken by non-directory files in each + directory under the starting directory, except that it doesn't look under any + CVS subdirectory:: + + import os + for root, dirs, files, rootfd in os.fwalk('python/Lib/email'): + print(root, "consumes", end="") + print(sum([os.fstatat(rootfd, name).st_size for name in files]), + end="") + print("bytes in", len(files), "non-directory files") + if 'CVS' in dirs: + dirs.remove('CVS') # don't visit CVS directories + + In the next example, walking the tree bottom-up is essential: + :func:`unlinkat` doesn't allow deleting a directory before the directory is + empty:: + + # Delete everything reachable from the directory named in "top", + # assuming there are no symbolic links. + # CAUTION: This is dangerous! For example, if top == '/', it + # could delete all your disk files. + import os + for root, dirs, files, rootfd in os.fwalk(top, topdown=False): + for name in files: + os.unlinkat(rootfd, name) + for name in dirs: + os.unlinkat(rootfd, name, os.AT_REMOVEDIR) + + Availability: Unix. + + .. versionadded:: 3.3 + + .. _os-process: Process Management diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst index 442ec8dabc8..c20c889550b 100644 --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -497,6 +497,10 @@ os (Patch submitted by Giampaolo RodolĂ  in :issue:`10784`.) +* The :mod:`os` module has a new :func:`~os.fwalk` function similar to + :func:`~os.walk` except that it also yields file descriptors referring to the + directories visited. This is especially useful to avoid symlink races. + * "at" functions (:issue:`4761`): * :func:`~os.faccessat` diff --git a/Lib/os.py b/Lib/os.py index 28979bf8bd2..ad5d5381c04 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -24,6 +24,7 @@ and opendir), and leave all pathname manipulation to os.path #' import sys, errno +import stat as st _names = sys.builtin_module_names @@ -32,6 +33,9 @@ __all__ = ["altsep", "curdir", "pardir", "sep", "pathsep", "linesep", "defpath", "name", "path", "devnull", "SEEK_SET", "SEEK_CUR", "SEEK_END"] +def _exists(name): + return name in globals() + def _get_exports_list(module): try: return list(module.__all__) @@ -120,7 +124,12 @@ def _get_masked_mode(mode): umask(mask) return mode & ~mask -#' +def _are_same_file(stat1, stat2): + """Helper function that checks whether two stat results refer to the same + file. + """ + return (stat1.st_ino == stat2.st_ino and stat1.st_dev == stat2.st_dev) +# # Super directory utilities. # (Inspired by Eric Raymond; the doc strings are mostly his) @@ -151,7 +160,6 @@ def makedirs(name, mode=0o777, exist_ok=False): try: mkdir(name, mode) except OSError as e: - import stat as st if not (e.errno == errno.EEXIST and exist_ok and path.isdir(name) and st.S_IMODE(lstat(name).st_mode) == _get_masked_mode(mode)): raise @@ -298,6 +306,92 @@ def walk(top, topdown=True, onerror=None, followlinks=False): __all__.append("walk") +if _exists("openat"): + + def fwalk(top, topdown=True, onerror=None, followlinks=False): + """Directory tree generator. + + This behaves exactly like walk(), except that it yields a 4-tuple + + dirpath, dirnames, filenames, dirfd + + `dirpath`, `dirnames` and `filenames` are identical to walk() output, + and `dirfd` is a file descriptor referring to the directory `dirpath`. + + The advantage of walkfd() over walk() is that it's safe against symlink + races (when followlinks is False). + + Caution: + Since fwalk() yields file descriptors, those are only valid until the + next iteration step, so you should dup() them if you want to keep them + for a longer period. + + Example: + + import os + for root, dirs, files, rootfd in os.fwalk('python/Lib/email'): + print(root, "consumes", end="") + print(sum([os.fstatat(rootfd, name).st_size for name in files]), + end="") + print("bytes in", len(files), "non-directory files") + if 'CVS' in dirs: + dirs.remove('CVS') # don't visit CVS directories + """ + # Note: To guard against symlink races, we use the standard + # lstat()/open()/fstat() trick. + orig_st = lstat(top) + topfd = open(top, O_RDONLY) + try: + if (followlinks or (st.S_ISDIR(orig_st.st_mode) and + _are_same_file(orig_st, fstat(topfd)))): + for x in _fwalk(topfd, top, topdown, onerror, followlinks): + yield x + finally: + close(topfd) + + def _fwalk(topfd, toppath, topdown, onerror, followlinks): + # Note: This uses O(depth of the directory tree) file descriptors: if + # necessary, it can be adapted to only require O(1) FDs, see issue + # #13734. + + # whether to follow symlinks + flag = 0 if followlinks else AT_SYMLINK_NOFOLLOW + + names = fdlistdir(topfd) + dirs, nondirs = [], [] + for name in names: + # Here, we don't use AT_SYMLINK_NOFOLLOW to be consistent with + # walk() which reports symlinks to directories as directories. We do + # however check for symlinks before recursing into a subdirectory. + if st.S_ISDIR(fstatat(topfd, name).st_mode): + dirs.append(name) + else: + nondirs.append(name) + + if topdown: + yield toppath, dirs, nondirs, topfd + + for name in dirs: + try: + orig_st = fstatat(topfd, name, flag) + dirfd = openat(topfd, name, O_RDONLY) + except error as err: + if onerror is not None: + onerror(err) + return + try: + if followlinks or _are_same_file(orig_st, fstat(dirfd)): + dirpath = path.join(toppath, name) + for x in _fwalk(dirfd, dirpath, topdown, onerror, followlinks): + yield x + finally: + close(dirfd) + + if not topdown: + yield toppath, dirs, nondirs, topfd + + __all__.append("fwalk") + # Make sure os.environ exists, at least try: environ @@ -598,9 +692,6 @@ def _fscodec(): fsencode, fsdecode = _fscodec() del _fscodec -def _exists(name): - return name in globals() - # Supply spawn*() (probably only for Unix) if _exists("fork") and not _exists("spawnv") and _exists("execv"): diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index 293005b3a1d..7f955d1e90f 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -20,6 +20,8 @@ import uuid import asyncore import asynchat import socket +import itertools +import stat try: import threading except ImportError: @@ -159,7 +161,6 @@ class StatAttributeTests(unittest.TestCase): if not hasattr(os, "stat"): return - import stat result = os.stat(fname) # Make sure direct access works @@ -476,7 +477,7 @@ class EnvironTests(mapping_tests.BasicTestMappingProtocol): class WalkTests(unittest.TestCase): """Tests for os.walk().""" - def test_traversal(self): + def setUp(self): import os from os.path import join @@ -586,6 +587,60 @@ class WalkTests(unittest.TestCase): os.remove(dirname) os.rmdir(support.TESTFN) + +@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()") +class FwalkTests(WalkTests): + """Tests for os.fwalk().""" + + def test_compare_to_walk(self): + # compare with walk() results + for topdown, followlinks in itertools.product((True, False), repeat=2): + args = support.TESTFN, topdown, None, followlinks + expected = {} + for root, dirs, files in os.walk(*args): + expected[root] = (set(dirs), set(files)) + + for root, dirs, files, rootfd in os.fwalk(*args): + self.assertIn(root, expected) + self.assertEqual(expected[root], (set(dirs), set(files))) + + def test_dir_fd(self): + # check returned file descriptors + for topdown, followlinks in itertools.product((True, False), repeat=2): + args = support.TESTFN, topdown, None, followlinks + for root, dirs, files, rootfd in os.fwalk(*args): + # check that the FD is valid + os.fstat(rootfd) + # check that fdlistdir() returns consistent information + self.assertEqual(set(os.fdlistdir(rootfd)), set(dirs) | set(files)) + + def test_fd_leak(self): + # Since we're opening a lot of FDs, we must be careful to avoid leaks: + # we both check that calling fwalk() a large number of times doesn't + # yield EMFILE, and that the minimum allocated FD hasn't changed. + minfd = os.dup(1) + os.close(minfd) + for i in range(256): + for x in os.fwalk(support.TESTFN): + pass + newfd = os.dup(1) + self.addCleanup(os.close, newfd) + self.assertEqual(newfd, minfd) + + def tearDown(self): + # cleanup + for root, dirs, files, rootfd in os.fwalk(support.TESTFN, topdown=False): + for name in files: + os.unlinkat(rootfd, name) + for name in dirs: + st = os.fstatat(rootfd, name, os.AT_SYMLINK_NOFOLLOW) + if stat.S_ISDIR(st.st_mode): + os.unlinkat(rootfd, name, os.AT_REMOVEDIR) + else: + os.unlinkat(rootfd, name) + os.rmdir(support.TESTFN) + + class MakedirTests(unittest.TestCase): def setUp(self): os.mkdir(support.TESTFN) @@ -1700,6 +1755,7 @@ def test_main(): StatAttributeTests, EnvironTests, WalkTests, + FwalkTests, MakedirTests, DevNullTests, URandomTests, diff --git a/Misc/NEWS b/Misc/NEWS index 47098289863..d1f5cb3ec6a 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -466,6 +466,9 @@ Core and Builtins Library ------- +- Issue #13734: Add os.fwalk(), a directory walking function yielding file + descriptors. + - Issue #2945: Make the distutils upload command aware of bdist_rpm products. - Issue #13712: pysetup create should not convert package_data to extra_files.