mirror of https://github.com/python/cpython
bpo-35823: subprocess: Use vfork() instead of fork() on Linux when safe (GH-11671)
* bpo-35823: subprocess: Use vfork() instead of fork() on Linux when safe When used to run a new executable image, fork() is not a good choice for process creation, especially if the parent has a large working set: fork() needs to copy page tables, which is slow, and may fail on systems where overcommit is disabled, despite that the child is not going to touch most of its address space. Currently, subprocess is capable of using posix_spawn() instead, which normally provides much better performance. However, posix_spawn() does not support many of child setup operations exposed by subprocess.Popen(). Most notably, it's not possible to express `close_fds=True`, which happens to be the default, via posix_spawn(). As a result, most users can't benefit from faster process creation, at least not without changing their code. However, Linux provides vfork() system call, which creates a new process without copying the address space of the parent, and which is actually used by C libraries to efficiently implement posix_spawn(). Due to sharing of the address space and even the stack with the parent, extreme care is required to use vfork(). At least the following restrictions must hold: * No signal handlers must execute in the child process. Otherwise, they might clobber memory shared with the parent, potentially confusing it. * Any library function called after vfork() in the child must be async-signal-safe (as for fork()), but it must also not interact with any library state in a way that might break due to address space sharing and/or lack of any preparations performed by libraries on normal fork(). POSIX.1 permits to call only execve() and _exit(), and later revisions remove vfork() specification entirely. In practice, however, almost all operations needed by subprocess.Popen() can be safely implemented on Linux. * Due to sharing of the stack with the parent, the child must be careful not to clobber local variables that are alive across vfork() call. Compilers are normally aware of this and take extra care with vfork() (and setjmp(), which has a similar problem). * In case the parent is privileged, special attention must be paid to vfork() use, because sharing an address space across different privilege domains is insecure[1]. This patch adds support for using vfork() instead of fork() on Linux when it's possible to do safely given the above. In particular: * vfork() is not used if credential switch is requested. The reverse case (simple subprocess.Popen() but another application thread switches credentials concurrently) is not possible for pure-Python apps because subprocess.Popen() and functions like os.setuid() are mutually excluded via GIL. We might also consider to add a way to opt-out of vfork() (and posix_spawn() on platforms where it might be implemented via vfork()) in a future PR. * vfork() is not used if `preexec_fn != None`. With this change, subprocess will still use posix_spawn() if possible, but will fallback to vfork() on Linux in most cases, and, failing that, to fork(). [1] https://ewontfix.com/7 Co-authored-by: Gregory P. Smith [Google LLC] <gps@google.com>
This commit is contained in:
parent
16ee68da6e
commit
976da903a7
|
@ -0,0 +1,2 @@
|
||||||
|
Use ``vfork()`` instead of ``fork()`` for :func:`subprocess.Popen` on Linux
|
||||||
|
to improve performance in cases where it is deemed safe.
|
|
@ -36,6 +36,12 @@
|
||||||
# define SYS_getdents64 __NR_getdents64
|
# define SYS_getdents64 __NR_getdents64
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(HAVE_VFORK) && defined(HAVE_SIGNAL_H) && \
|
||||||
|
defined(HAVE_PTHREAD_SIGMASK) && !defined(HAVE_BROKEN_PTHREAD_SIGMASK)
|
||||||
|
# include <signal.h>
|
||||||
|
# define VFORK_USABLE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__sun) && defined(__SVR4)
|
#if defined(__sun) && defined(__SVR4)
|
||||||
/* readdir64 is used to work around Solaris 9 bug 6395699. */
|
/* readdir64 is used to work around Solaris 9 bug 6395699. */
|
||||||
# define readdir readdir64
|
# define readdir readdir64
|
||||||
|
@ -407,9 +413,53 @@ _close_open_fds_maybe_unsafe(long start_fd, PyObject* py_fds_to_keep)
|
||||||
#endif /* else NOT (defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)) */
|
#endif /* else NOT (defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)) */
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef VFORK_USABLE
|
||||||
|
/* Reset dispositions for all signals to SIG_DFL except for ignored
|
||||||
|
* signals. This way we ensure that no signal handlers can run
|
||||||
|
* after we unblock signals in a child created by vfork().
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
reset_signal_handlers(const sigset_t *child_sigmask)
|
||||||
|
{
|
||||||
|
struct sigaction sa_dfl = {.sa_handler = SIG_DFL};
|
||||||
|
for (int sig = 1; sig < _NSIG; sig++) {
|
||||||
|
/* Dispositions for SIGKILL and SIGSTOP can't be changed. */
|
||||||
|
if (sig == SIGKILL || sig == SIGSTOP) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* There is no need to reset the disposition of signals that will
|
||||||
|
* remain blocked across execve() since the kernel will do it. */
|
||||||
|
if (sigismember(child_sigmask, sig) == 1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct sigaction sa;
|
||||||
|
/* C libraries usually return EINVAL for signals used
|
||||||
|
* internally (e.g. for thread cancellation), so simply
|
||||||
|
* skip errors here. */
|
||||||
|
if (sigaction(sig, NULL, &sa) == -1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* void *h works as these fields are both pointer types already. */
|
||||||
|
void *h = (sa.sa_flags & SA_SIGINFO ? (void *)sa.sa_sigaction :
|
||||||
|
(void *)sa.sa_handler);
|
||||||
|
if (h == SIG_IGN || h == SIG_DFL) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* This call can't reasonably fail, but if it does, terminating
|
||||||
|
* the child seems to be too harsh, so ignore errors. */
|
||||||
|
(void) sigaction(sig, &sa_dfl, NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif /* VFORK_USABLE */
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This function is code executed in the child process immediately after fork
|
* This function is code executed in the child process immediately after
|
||||||
* to set things up and call exec().
|
* (v)fork to set things up and call exec().
|
||||||
*
|
*
|
||||||
* All of the code in this function must only use async-signal-safe functions,
|
* All of the code in this function must only use async-signal-safe functions,
|
||||||
* listed at `man 7 signal` or
|
* listed at `man 7 signal` or
|
||||||
|
@ -417,8 +467,28 @@ _close_open_fds_maybe_unsafe(long start_fd, PyObject* py_fds_to_keep)
|
||||||
*
|
*
|
||||||
* This restriction is documented at
|
* This restriction is documented at
|
||||||
* http://www.opengroup.org/onlinepubs/009695399/functions/fork.html.
|
* http://www.opengroup.org/onlinepubs/009695399/functions/fork.html.
|
||||||
|
*
|
||||||
|
* If this function is called after vfork(), even more care must be taken.
|
||||||
|
* The lack of preparations that C libraries normally take on fork(),
|
||||||
|
* as well as sharing the address space with the parent, might make even
|
||||||
|
* async-signal-safe functions vfork-unsafe. In particular, on Linux,
|
||||||
|
* set*id() and setgroups() library functions must not be called, since
|
||||||
|
* they have to interact with the library-level thread list and send
|
||||||
|
* library-internal signals to implement per-process credentials semantics
|
||||||
|
* required by POSIX but not supported natively on Linux. Another reason to
|
||||||
|
* avoid this family of functions is that sharing an address space between
|
||||||
|
* processes running with different privileges is inherently insecure.
|
||||||
|
* See bpo-35823 for further discussion and references.
|
||||||
|
*
|
||||||
|
* In some C libraries, setrlimit() has the same thread list/signalling
|
||||||
|
* behavior since resource limits were per-thread attributes before
|
||||||
|
* Linux 2.6.10. Musl, as of 1.2.1, is known to have this issue
|
||||||
|
* (https://www.openwall.com/lists/musl/2020/10/15/6).
|
||||||
|
*
|
||||||
|
* If vfork-unsafe functionality is desired after vfork(), consider using
|
||||||
|
* syscall() to obtain it.
|
||||||
*/
|
*/
|
||||||
static void
|
_Py_NO_INLINE static void
|
||||||
child_exec(char *const exec_array[],
|
child_exec(char *const exec_array[],
|
||||||
char *const argv[],
|
char *const argv[],
|
||||||
char *const envp[],
|
char *const envp[],
|
||||||
|
@ -432,6 +502,7 @@ child_exec(char *const exec_array[],
|
||||||
int call_setgid, gid_t gid,
|
int call_setgid, gid_t gid,
|
||||||
int call_setgroups, size_t groups_size, const gid_t *groups,
|
int call_setgroups, size_t groups_size, const gid_t *groups,
|
||||||
int call_setuid, uid_t uid, int child_umask,
|
int call_setuid, uid_t uid, int child_umask,
|
||||||
|
const void *child_sigmask,
|
||||||
PyObject *py_fds_to_keep,
|
PyObject *py_fds_to_keep,
|
||||||
PyObject *preexec_fn,
|
PyObject *preexec_fn,
|
||||||
PyObject *preexec_fn_args_tuple)
|
PyObject *preexec_fn_args_tuple)
|
||||||
|
@ -507,6 +578,13 @@ child_exec(char *const exec_array[],
|
||||||
if (restore_signals)
|
if (restore_signals)
|
||||||
_Py_RestoreSignals();
|
_Py_RestoreSignals();
|
||||||
|
|
||||||
|
#ifdef VFORK_USABLE
|
||||||
|
if (child_sigmask) {
|
||||||
|
reset_signal_handlers(child_sigmask);
|
||||||
|
POSIX_CALL(pthread_sigmask(SIG_SETMASK, child_sigmask, NULL));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_SETSID
|
#ifdef HAVE_SETSID
|
||||||
if (call_setsid)
|
if (call_setsid)
|
||||||
POSIX_CALL(setsid());
|
POSIX_CALL(setsid());
|
||||||
|
@ -599,6 +677,81 @@ error:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* The main purpose of this wrapper function is to isolate vfork() from both
|
||||||
|
* subprocess_fork_exec() and child_exec(). A child process created via
|
||||||
|
* vfork() executes on the same stack as the parent process while the latter is
|
||||||
|
* suspended, so this function should not be inlined to avoid compiler bugs
|
||||||
|
* that might clobber data needed by the parent later. Additionally,
|
||||||
|
* child_exec() should not be inlined to avoid spurious -Wclobber warnings from
|
||||||
|
* GCC (see bpo-35823).
|
||||||
|
*/
|
||||||
|
_Py_NO_INLINE static pid_t
|
||||||
|
do_fork_exec(char *const exec_array[],
|
||||||
|
char *const argv[],
|
||||||
|
char *const envp[],
|
||||||
|
const char *cwd,
|
||||||
|
int p2cread, int p2cwrite,
|
||||||
|
int c2pread, int c2pwrite,
|
||||||
|
int errread, int errwrite,
|
||||||
|
int errpipe_read, int errpipe_write,
|
||||||
|
int close_fds, int restore_signals,
|
||||||
|
int call_setsid,
|
||||||
|
int call_setgid, gid_t gid,
|
||||||
|
int call_setgroups, size_t groups_size, const gid_t *groups,
|
||||||
|
int call_setuid, uid_t uid, int child_umask,
|
||||||
|
const void *child_sigmask,
|
||||||
|
PyObject *py_fds_to_keep,
|
||||||
|
PyObject *preexec_fn,
|
||||||
|
PyObject *preexec_fn_args_tuple)
|
||||||
|
{
|
||||||
|
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
|
#ifdef VFORK_USABLE
|
||||||
|
if (child_sigmask) {
|
||||||
|
/* These are checked by our caller; verify them in debug builds. */
|
||||||
|
assert(!call_setsid);
|
||||||
|
assert(!call_setuid);
|
||||||
|
assert(!call_setgid);
|
||||||
|
assert(!call_setgroups);
|
||||||
|
assert(preexec_fn == Py_None);
|
||||||
|
|
||||||
|
pid = vfork();
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
pid = fork();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pid != 0) {
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Child process.
|
||||||
|
* See the comment above child_exec() for restrictions imposed on
|
||||||
|
* the code below.
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (preexec_fn != Py_None) {
|
||||||
|
/* We'll be calling back into Python later so we need to do this.
|
||||||
|
* This call may not be async-signal-safe but neither is calling
|
||||||
|
* back into Python. The user asked us to use hope as a strategy
|
||||||
|
* to avoid deadlock... */
|
||||||
|
PyOS_AfterFork_Child();
|
||||||
|
}
|
||||||
|
|
||||||
|
child_exec(exec_array, argv, envp, cwd,
|
||||||
|
p2cread, p2cwrite, c2pread, c2pwrite,
|
||||||
|
errread, errwrite, errpipe_read, errpipe_write,
|
||||||
|
close_fds, restore_signals, call_setsid,
|
||||||
|
call_setgid, gid, call_setgroups, groups_size, groups,
|
||||||
|
call_setuid, uid, child_umask, child_sigmask,
|
||||||
|
py_fds_to_keep, preexec_fn, preexec_fn_args_tuple);
|
||||||
|
_exit(255);
|
||||||
|
return 0; /* Dead code to avoid a potential compiler warning. */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
subprocess_fork_exec(PyObject* self, PyObject *args)
|
subprocess_fork_exec(PyObject* self, PyObject *args)
|
||||||
{
|
{
|
||||||
|
@ -836,39 +989,56 @@ subprocess_fork_exec(PyObject* self, PyObject *args)
|
||||||
need_after_fork = 1;
|
need_after_fork = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
pid = fork();
|
/* NOTE: When old_sigmask is non-NULL, do_fork_exec() may use vfork(). */
|
||||||
if (pid == 0) {
|
const void *old_sigmask = NULL;
|
||||||
/* Child process */
|
#ifdef VFORK_USABLE
|
||||||
/*
|
/* Use vfork() only if it's safe. See the comment above child_exec(). */
|
||||||
* Code from here to _exit() must only use async-signal-safe functions,
|
sigset_t old_sigs;
|
||||||
* listed at `man 7 signal` or
|
if (preexec_fn == Py_None &&
|
||||||
* http://www.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html.
|
!call_setuid && !call_setgid && !call_setgroups && !call_setsid) {
|
||||||
|
/* Block all signals to ensure that no signal handlers are run in the
|
||||||
|
* child process while it shares memory with us. Note that signals
|
||||||
|
* used internally by C libraries won't be blocked by
|
||||||
|
* pthread_sigmask(), but signal handlers installed by C libraries
|
||||||
|
* normally service only signals originating from *within the process*,
|
||||||
|
* so it should be sufficient to consider any library function that
|
||||||
|
* might send such a signal to be vfork-unsafe and do not call it in
|
||||||
|
* the child.
|
||||||
*/
|
*/
|
||||||
|
sigset_t all_sigs;
|
||||||
if (preexec_fn != Py_None) {
|
sigfillset(&all_sigs);
|
||||||
/* We'll be calling back into Python later so we need to do this.
|
pthread_sigmask(SIG_BLOCK, &all_sigs, &old_sigs);
|
||||||
* This call may not be async-signal-safe but neither is calling
|
old_sigmask = &old_sigs;
|
||||||
* back into Python. The user asked us to use hope as a strategy
|
|
||||||
* to avoid deadlock... */
|
|
||||||
PyOS_AfterFork_Child();
|
|
||||||
}
|
|
||||||
|
|
||||||
child_exec(exec_array, argv, envp, cwd,
|
|
||||||
p2cread, p2cwrite, c2pread, c2pwrite,
|
|
||||||
errread, errwrite, errpipe_read, errpipe_write,
|
|
||||||
close_fds, restore_signals, call_setsid,
|
|
||||||
call_setgid, gid, call_setgroups, num_groups, groups,
|
|
||||||
call_setuid, uid, child_umask,
|
|
||||||
py_fds_to_keep, preexec_fn, preexec_fn_args_tuple);
|
|
||||||
_exit(255);
|
|
||||||
return NULL; /* Dead code to avoid a potential compiler warning. */
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
pid = do_fork_exec(exec_array, argv, envp, cwd,
|
||||||
|
p2cread, p2cwrite, c2pread, c2pwrite,
|
||||||
|
errread, errwrite, errpipe_read, errpipe_write,
|
||||||
|
close_fds, restore_signals, call_setsid,
|
||||||
|
call_setgid, gid, call_setgroups, num_groups, groups,
|
||||||
|
call_setuid, uid, child_umask, old_sigmask,
|
||||||
|
py_fds_to_keep, preexec_fn, preexec_fn_args_tuple);
|
||||||
|
|
||||||
/* Parent (original) process */
|
/* Parent (original) process */
|
||||||
if (pid == -1) {
|
if (pid == -1) {
|
||||||
/* Capture errno for the exception. */
|
/* Capture errno for the exception. */
|
||||||
saved_errno = errno;
|
saved_errno = errno;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef VFORK_USABLE
|
||||||
|
if (old_sigmask) {
|
||||||
|
/* vfork() semantics guarantees that the parent is blocked
|
||||||
|
* until the child performs _exit() or execve(), so it is safe
|
||||||
|
* to unblock signals once we're here.
|
||||||
|
* Note that in environments where vfork() is implemented as fork(),
|
||||||
|
* such as QEMU user-mode emulation, the parent won't be blocked,
|
||||||
|
* but it won't share the address space with the child,
|
||||||
|
* so it's still safe to unblock the signals. */
|
||||||
|
pthread_sigmask(SIG_SETMASK, old_sigmask, NULL);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
Py_XDECREF(cwd_obj2);
|
Py_XDECREF(cwd_obj2);
|
||||||
|
|
||||||
if (need_after_fork)
|
if (need_after_fork)
|
||||||
|
|
|
@ -11732,7 +11732,7 @@ for ac_func in alarm accept4 setitimer getitimer bind_textdomain_codeset chown \
|
||||||
sigaction sigaltstack sigfillset siginterrupt sigpending sigrelse \
|
sigaction sigaltstack sigfillset siginterrupt sigpending sigrelse \
|
||||||
sigtimedwait sigwait sigwaitinfo snprintf strftime strlcpy strsignal symlinkat sync \
|
sigtimedwait sigwait sigwaitinfo snprintf strftime strlcpy strsignal symlinkat sync \
|
||||||
sysconf tcgetpgrp tcsetpgrp tempnam timegm times tmpfile tmpnam tmpnam_r \
|
sysconf tcgetpgrp tcsetpgrp tempnam timegm times tmpfile tmpnam tmpnam_r \
|
||||||
truncate uname unlinkat utimensat utimes waitid waitpid wait3 wait4 \
|
truncate uname unlinkat utimensat utimes vfork waitid waitpid wait3 wait4 \
|
||||||
wcscoll wcsftime wcsxfrm wmemcmp writev _getpty rtpSpawn
|
wcscoll wcsftime wcsxfrm wmemcmp writev _getpty rtpSpawn
|
||||||
do :
|
do :
|
||||||
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
|
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
|
||||||
|
|
|
@ -3690,7 +3690,7 @@ AC_CHECK_FUNCS(alarm accept4 setitimer getitimer bind_textdomain_codeset chown \
|
||||||
sigaction sigaltstack sigfillset siginterrupt sigpending sigrelse \
|
sigaction sigaltstack sigfillset siginterrupt sigpending sigrelse \
|
||||||
sigtimedwait sigwait sigwaitinfo snprintf strftime strlcpy strsignal symlinkat sync \
|
sigtimedwait sigwait sigwaitinfo snprintf strftime strlcpy strsignal symlinkat sync \
|
||||||
sysconf tcgetpgrp tcsetpgrp tempnam timegm times tmpfile tmpnam tmpnam_r \
|
sysconf tcgetpgrp tcsetpgrp tempnam timegm times tmpfile tmpnam tmpnam_r \
|
||||||
truncate uname unlinkat utimensat utimes waitid waitpid wait3 wait4 \
|
truncate uname unlinkat utimensat utimes vfork waitid waitpid wait3 wait4 \
|
||||||
wcscoll wcsftime wcsxfrm wmemcmp writev _getpty rtpSpawn)
|
wcscoll wcsftime wcsxfrm wmemcmp writev _getpty rtpSpawn)
|
||||||
|
|
||||||
# Force lchmod off for Linux. Linux disallows changing the mode of symbolic
|
# Force lchmod off for Linux. Linux disallows changing the mode of symbolic
|
||||||
|
|
|
@ -1301,6 +1301,9 @@
|
||||||
/* Define to 1 if you have the <uuid/uuid.h> header file. */
|
/* Define to 1 if you have the <uuid/uuid.h> header file. */
|
||||||
#undef HAVE_UUID_UUID_H
|
#undef HAVE_UUID_UUID_H
|
||||||
|
|
||||||
|
/* Define to 1 if you have the `vfork' function. */
|
||||||
|
#undef HAVE_VFORK
|
||||||
|
|
||||||
/* Define to 1 if you have the `wait3' function. */
|
/* Define to 1 if you have the `wait3' function. */
|
||||||
#undef HAVE_WAIT3
|
#undef HAVE_WAIT3
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue