diff --git a/Doc/library/os.rst b/Doc/library/os.rst --- a/Doc/library/os.rst +++ b/Doc/library/os.rst @@ -2240,6 +2240,58 @@ os.rmdir(os.path.join(root, name)) +.. function:: fdwalk(top, topdown=True, onerror=None, followlinks=False) + + .. index:: + single: directory; walking + single: directory; traversal + + This behaves exactly like :func:`walk`, except that it yields a 4-tuple + ``(dirpath, dirnames, filenames, dirfd)``. + + *dirpath*, *dirnames* and *filenames* are identical to :func:`walk` output, + and *dirfd* is a file descriptor referring to the directory *dirpath*. + + .. note:: + + Since :func:`fdwalk` yields file descriptors, those are only valid until + the next iteration step, so you should duplicate them (e.g. with + :func:`dup`) if you want to keep them longer. + + .. note:: + + Contrarily to :func:`walk`, modifying the dirnames list in-place won't + affect the directories traversed. + + This example displays the number of bytes taken by non-directory files in each + directory under the starting directory:: + + import os + for root, dirs, files, rootfd in os.fdwalk('python/Lib/email'): + print(root, "consumes", end="") + print(sum([os.fstatat(rootfd, name).st_size for name in files]), + end="") + print("bytes in", len(files), "non-directory files") + + In the next example, walking the tree bottom-up is essential: + :func:`unlinkat` doesn't allow deleting a directory before the directory is + empty:: + + # Delete everything reachable from the directory named in "top", + # assuming there are no symbolic links. + # CAUTION: This is dangerous! For example, if top == '/', it + # could delete all your disk files. + import os + for root, dirs, files, rootfd in os.fdwalk(top, topdown=False): + for name in files: + os.unlinkat(rootfd, name) + for name in dirs: + os.unlinkat(rootfd, name, os.AT_REMOVEDIR) + + Availability: Unix. + + .. versionadded:: 3.3 + .. _os-process: Process Management diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -478,6 +478,10 @@ (Patch submitted by Giampaolo RodolĂ  in :issue:`10784`.) +* The :mod:`os` module has a new :func:`~os.fdwalk` function similar to + :func:`~os.walk` except that it also yields file descriptors referring to the + directories visited. This is especially useful to avoid symlink races. + * "at" functions (:issue:`4761`): * :func:`~os.faccessat` diff --git a/Lib/os.py b/Lib/os.py --- a/Lib/os.py +++ b/Lib/os.py @@ -24,6 +24,7 @@ #' import sys, errno +import stat as st _names = sys.builtin_module_names @@ -32,6 +33,9 @@ "defpath", "name", "path", "devnull", "SEEK_SET", "SEEK_CUR", "SEEK_END"] +def _exists(name): + return name in globals() + def _get_exports_list(module): try: return list(module.__all__) @@ -120,7 +124,13 @@ umask(mask) return mode & ~mask -#' +def _are_same_file(stat1, stat2): + """Helper function that checks whether two stat results refer to the same + file. + """ + return (stat1.st_mode == stat2.st_mode and stat1.st_ino == stat2.st_ino and + stat1.st_dev == stat2.st_dev) +# # Super directory utilities. # (Inspired by Eric Raymond; the doc strings are mostly his) @@ -151,7 +161,6 @@ try: mkdir(name, mode) except OSError as e: - import stat as st if not (e.errno == errno.EEXIST and exist_ok and path.isdir(name) and st.S_IMODE(lstat(name).st_mode) == _get_masked_mode(mode)): raise @@ -298,6 +307,94 @@ __all__.append("walk") +if _exists("openat"): + + def fdwalk(top, topdown=True, onerror=None, followlinks=False): + """Directory tree generator. + + This behaves exactly like walk(), except that it yields a 4-tuple + + dirpath, dirnames, filenames, dirfd + + `dirpath`, `dirnames` and `filenames` are identical to walk() output, + and `dirfd` is a file descriptor referring to the directory `dirpath`. + + The advantage of walkfd() over walk() is that it's safe against symlink + races (when followlinks is False). + + Caution: + Since fdwalk() yields file descriptors, those are only valid until the + next iteration step, so you should dup() them if you want to keep them + for a longer period. + Also, contrarily to walk(), modifying the `dirnames` list in-place won't + affect the directories traversed. + + Example: + + import os + for root, dirs, files, rootfd in os.fdwalk('python/Lib/email'): + print(root, "consumes", end="") + print(sum([os.fstatat(rootfd, name).st_size for name in files]), + end="") + print("bytes in", len(files), "non-directory files") + """ + # Note: To guard against symlink races, we use the standard + # lstat()/open()/fstat() trick. + orig_st = lstat(top) + topfd = open(top, O_RDONLY) + try: + if (followlinks or (st.S_ISDIR(orig_st.st_mode) and + _are_same_file(orig_st, fstat(topfd)))): + for x in _fdwalk(topfd, top, topdown, onerror, followlinks): + yield x + finally: + close(topfd) + + def _fdwalk(topfd, toppath, topdown, onerror, followlinks): + try: + names = fdlistdir(topfd) + except error as err: + if onerror is not None: + onerror(err) + return + + dirs, nondirs = [], [] + for name in names: + # Here, we don't use AT_SYMLINK_NOFOLLOW to be consistent with + # walk() which reports symlinks to directories as directories. We do + # however check for symlinks before recursing into a subdirectory. + if st.S_ISDIR(fstatat(topfd, name).st_mode): + dirs.append(name) + else: + nondirs.append(name) + + # whether to follow symlinks + flag = 0 if followlinks else AT_SYMLINK_NOFOLLOW + + if topdown: + yield toppath, dirs, nondirs, topfd + + for name in dirs: + try: + orig_st = fstatat(topfd, name, flag) + dirfd = openat(topfd, name, O_RDONLY) + except error as err: + if onerror is not None: + onerror(err) + return + try: + if followlinks or _are_same_file(orig_st, fstat(dirfd)): + dirpath = path.join(toppath, name) + for x in _fdwalk(dirfd, dirpath, topdown, onerror, followlinks): + yield x + finally: + close(dirfd) + + if not topdown: + yield toppath, dirs, nondirs, topfd + + __all__.append("fdwalk") + # Make sure os.environ exists, at least try: environ @@ -598,9 +695,6 @@ fsencode, fsdecode = _fscodec() del _fscodec -def _exists(name): - return name in globals() - # Supply spawn*() (probably only for Unix) if _exists("fork") and not _exists("spawnv") and _exists("execv"): diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -20,6 +20,8 @@ import asyncore import asynchat import socket +import itertools +import stat try: import threading except ImportError: @@ -147,7 +149,6 @@ if not hasattr(os, "stat"): return - import stat result = os.stat(fname) # Make sure direct access works @@ -464,7 +465,7 @@ class WalkTests(unittest.TestCase): """Tests for os.walk().""" - def test_traversal(self): + def setUp(self): import os from os.path import join @@ -569,6 +570,58 @@ os.remove(dirname) os.rmdir(support.TESTFN) +@unittest.skipUnless(hasattr(os, 'fdwalk'), "Test needs os.fdwalk()") +class FdWalkTests(WalkTests): + """Tests for os.fdwalk().""" + + def test_compare_to_walk(self): + # compare with walk() results + for topdown, followlinks in itertools.product((True, False), repeat=2): + args = support.TESTFN, topdown, None, followlinks + expected = {} + for root, dirs, files in os.walk(*args): + expected[root] = (set(dirs), set(files)) + + for root, dirs, files, rootfd in os.fdwalk(*args): + self.assertIn(root, expected) + self.assertEqual(expected[root], (set(dirs), set(files))) + + def test_dir_fd(self): + # check returned file descriptors + for topdown, followlinks in itertools.product((True, False), repeat=2): + args = support.TESTFN, topdown, None, followlinks + for root, dirs, files, rootfd in os.fdwalk(*args): + # check that the FD is valid + os.fstat(rootfd) + # check that fdlistdir() returns consistent information + self.assertEqual(set(os.fdlistdir(rootfd)), set(dirs) | set(files)) + + def test_fd_leak(self): + # Since we're opening a lot of FDs, we must be careful to avoid leaks: + # we both check that calling fdwalk() a large number of times doesn't + # yield EMFILE, and that the minimum allocated FD hasn't changed. + minfd = os.dup(1) + os.close(minfd) + for i in range(512): + for x in os.fdwalk(support.TESTFN): + pass + newfd = os.dup(1) + self.addCleanup(os.close, newfd) + self.assertEqual(newfd, minfd) + + def tearDown(self): + # cleanup + for root, dirs, files, rootfd in os.fdwalk(support.TESTFN, topdown=False): + for name in files: + os.unlinkat(rootfd, name) + for name in dirs: + st = os.fstatat(rootfd, name, os.AT_SYMLINK_NOFOLLOW) + if stat.S_ISDIR(st.st_mode): + os.unlinkat(rootfd, name, os.AT_REMOVEDIR) + else: + os.unlinkat(rootfd, name) + os.rmdir(support.TESTFN) + class MakedirTests(unittest.TestCase): def setUp(self): os.mkdir(support.TESTFN) @@ -1683,6 +1736,7 @@ StatAttributeTests, EnvironTests, WalkTests, + FdWalkTests, MakedirTests, DevNullTests, URandomTests,