diff --git a/Misc/NEWS.d/next/Library/2020-10-16-07-45-35.bpo-35823.SNQo56.rst b/Misc/NEWS.d/next/Library/2020-10-16-07-45-35.bpo-35823.SNQo56.rst new file mode 100644 index 00000000000..cd428d36960 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-10-16-07-45-35.bpo-35823.SNQo56.rst @@ -0,0 +1,2 @@ +Use ``vfork()`` instead of ``fork()`` for :func:`subprocess.Popen` on Linux +to improve performance in cases where it is deemed safe. diff --git a/Modules/_posixsubprocess.c b/Modules/_posixsubprocess.c index d08c47980e9..ed498572a82 100644 --- a/Modules/_posixsubprocess.c +++ b/Modules/_posixsubprocess.c @@ -36,6 +36,12 @@ # define SYS_getdents64 __NR_getdents64 #endif +#if defined(__linux__) && defined(HAVE_VFORK) && defined(HAVE_SIGNAL_H) && \ + defined(HAVE_PTHREAD_SIGMASK) && !defined(HAVE_BROKEN_PTHREAD_SIGMASK) +# include +# define VFORK_USABLE 1 +#endif + #if defined(__sun) && defined(__SVR4) /* readdir64 is used to work around Solaris 9 bug 6395699. */ # define readdir readdir64 @@ -407,9 +413,53 @@ _close_open_fds_maybe_unsafe(long start_fd, PyObject* py_fds_to_keep) #endif /* else NOT (defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)) */ +#ifdef VFORK_USABLE +/* Reset dispositions for all signals to SIG_DFL except for ignored + * signals. This way we ensure that no signal handlers can run + * after we unblock signals in a child created by vfork(). + */ +static void +reset_signal_handlers(const sigset_t *child_sigmask) +{ + struct sigaction sa_dfl = {.sa_handler = SIG_DFL}; + for (int sig = 1; sig < _NSIG; sig++) { + /* Dispositions for SIGKILL and SIGSTOP can't be changed. */ + if (sig == SIGKILL || sig == SIGSTOP) { + continue; + } + + /* There is no need to reset the disposition of signals that will + * remain blocked across execve() since the kernel will do it. */ + if (sigismember(child_sigmask, sig) == 1) { + continue; + } + + struct sigaction sa; + /* C libraries usually return EINVAL for signals used + * internally (e.g. for thread cancellation), so simply + * skip errors here. */ + if (sigaction(sig, NULL, &sa) == -1) { + continue; + } + + /* void *h works as these fields are both pointer types already. */ + void *h = (sa.sa_flags & SA_SIGINFO ? (void *)sa.sa_sigaction : + (void *)sa.sa_handler); + if (h == SIG_IGN || h == SIG_DFL) { + continue; + } + + /* This call can't reasonably fail, but if it does, terminating + * the child seems to be too harsh, so ignore errors. */ + (void) sigaction(sig, &sa_dfl, NULL); + } +} +#endif /* VFORK_USABLE */ + + /* - * This function is code executed in the child process immediately after fork - * to set things up and call exec(). + * This function is code executed in the child process immediately after + * (v)fork to set things up and call exec(). * * All of the code in this function must only use async-signal-safe functions, * listed at `man 7 signal` or @@ -417,8 +467,28 @@ _close_open_fds_maybe_unsafe(long start_fd, PyObject* py_fds_to_keep) * * This restriction is documented at * http://www.opengroup.org/onlinepubs/009695399/functions/fork.html. + * + * If this function is called after vfork(), even more care must be taken. + * The lack of preparations that C libraries normally take on fork(), + * as well as sharing the address space with the parent, might make even + * async-signal-safe functions vfork-unsafe. In particular, on Linux, + * set*id() and setgroups() library functions must not be called, since + * they have to interact with the library-level thread list and send + * library-internal signals to implement per-process credentials semantics + * required by POSIX but not supported natively on Linux. Another reason to + * avoid this family of functions is that sharing an address space between + * processes running with different privileges is inherently insecure. + * See bpo-35823 for further discussion and references. + * + * In some C libraries, setrlimit() has the same thread list/signalling + * behavior since resource limits were per-thread attributes before + * Linux 2.6.10. Musl, as of 1.2.1, is known to have this issue + * (https://www.openwall.com/lists/musl/2020/10/15/6). + * + * If vfork-unsafe functionality is desired after vfork(), consider using + * syscall() to obtain it. */ -static void +_Py_NO_INLINE static void child_exec(char *const exec_array[], char *const argv[], char *const envp[], @@ -432,6 +502,7 @@ child_exec(char *const exec_array[], int call_setgid, gid_t gid, int call_setgroups, size_t groups_size, const gid_t *groups, int call_setuid, uid_t uid, int child_umask, + const void *child_sigmask, PyObject *py_fds_to_keep, PyObject *preexec_fn, PyObject *preexec_fn_args_tuple) @@ -507,6 +578,13 @@ child_exec(char *const exec_array[], if (restore_signals) _Py_RestoreSignals(); +#ifdef VFORK_USABLE + if (child_sigmask) { + reset_signal_handlers(child_sigmask); + POSIX_CALL(pthread_sigmask(SIG_SETMASK, child_sigmask, NULL)); + } +#endif + #ifdef HAVE_SETSID if (call_setsid) POSIX_CALL(setsid()); @@ -599,6 +677,81 @@ error: } +/* The main purpose of this wrapper function is to isolate vfork() from both + * subprocess_fork_exec() and child_exec(). A child process created via + * vfork() executes on the same stack as the parent process while the latter is + * suspended, so this function should not be inlined to avoid compiler bugs + * that might clobber data needed by the parent later. Additionally, + * child_exec() should not be inlined to avoid spurious -Wclobber warnings from + * GCC (see bpo-35823). + */ +_Py_NO_INLINE static pid_t +do_fork_exec(char *const exec_array[], + char *const argv[], + char *const envp[], + const char *cwd, + int p2cread, int p2cwrite, + int c2pread, int c2pwrite, + int errread, int errwrite, + int errpipe_read, int errpipe_write, + int close_fds, int restore_signals, + int call_setsid, + int call_setgid, gid_t gid, + int call_setgroups, size_t groups_size, const gid_t *groups, + int call_setuid, uid_t uid, int child_umask, + const void *child_sigmask, + PyObject *py_fds_to_keep, + PyObject *preexec_fn, + PyObject *preexec_fn_args_tuple) +{ + + pid_t pid; + +#ifdef VFORK_USABLE + if (child_sigmask) { + /* These are checked by our caller; verify them in debug builds. */ + assert(!call_setsid); + assert(!call_setuid); + assert(!call_setgid); + assert(!call_setgroups); + assert(preexec_fn == Py_None); + + pid = vfork(); + } else +#endif + { + pid = fork(); + } + + if (pid != 0) { + return pid; + } + + /* Child process. + * See the comment above child_exec() for restrictions imposed on + * the code below. + */ + + if (preexec_fn != Py_None) { + /* We'll be calling back into Python later so we need to do this. + * This call may not be async-signal-safe but neither is calling + * back into Python. The user asked us to use hope as a strategy + * to avoid deadlock... */ + PyOS_AfterFork_Child(); + } + + child_exec(exec_array, argv, envp, cwd, + p2cread, p2cwrite, c2pread, c2pwrite, + errread, errwrite, errpipe_read, errpipe_write, + close_fds, restore_signals, call_setsid, + call_setgid, gid, call_setgroups, groups_size, groups, + call_setuid, uid, child_umask, child_sigmask, + py_fds_to_keep, preexec_fn, preexec_fn_args_tuple); + _exit(255); + return 0; /* Dead code to avoid a potential compiler warning. */ +} + + static PyObject * subprocess_fork_exec(PyObject* self, PyObject *args) { @@ -836,39 +989,56 @@ subprocess_fork_exec(PyObject* self, PyObject *args) need_after_fork = 1; } - pid = fork(); - if (pid == 0) { - /* Child process */ - /* - * Code from here to _exit() must only use async-signal-safe functions, - * listed at `man 7 signal` or - * http://www.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html. + /* NOTE: When old_sigmask is non-NULL, do_fork_exec() may use vfork(). */ + const void *old_sigmask = NULL; +#ifdef VFORK_USABLE + /* Use vfork() only if it's safe. See the comment above child_exec(). */ + sigset_t old_sigs; + if (preexec_fn == Py_None && + !call_setuid && !call_setgid && !call_setgroups && !call_setsid) { + /* Block all signals to ensure that no signal handlers are run in the + * child process while it shares memory with us. Note that signals + * used internally by C libraries won't be blocked by + * pthread_sigmask(), but signal handlers installed by C libraries + * normally service only signals originating from *within the process*, + * so it should be sufficient to consider any library function that + * might send such a signal to be vfork-unsafe and do not call it in + * the child. */ - - if (preexec_fn != Py_None) { - /* We'll be calling back into Python later so we need to do this. - * This call may not be async-signal-safe but neither is calling - * back into Python. The user asked us to use hope as a strategy - * to avoid deadlock... */ - PyOS_AfterFork_Child(); - } - - child_exec(exec_array, argv, envp, cwd, - p2cread, p2cwrite, c2pread, c2pwrite, - errread, errwrite, errpipe_read, errpipe_write, - close_fds, restore_signals, call_setsid, - call_setgid, gid, call_setgroups, num_groups, groups, - call_setuid, uid, child_umask, - py_fds_to_keep, preexec_fn, preexec_fn_args_tuple); - _exit(255); - return NULL; /* Dead code to avoid a potential compiler warning. */ + sigset_t all_sigs; + sigfillset(&all_sigs); + pthread_sigmask(SIG_BLOCK, &all_sigs, &old_sigs); + old_sigmask = &old_sigs; } +#endif + + pid = do_fork_exec(exec_array, argv, envp, cwd, + p2cread, p2cwrite, c2pread, c2pwrite, + errread, errwrite, errpipe_read, errpipe_write, + close_fds, restore_signals, call_setsid, + call_setgid, gid, call_setgroups, num_groups, groups, + call_setuid, uid, child_umask, old_sigmask, + py_fds_to_keep, preexec_fn, preexec_fn_args_tuple); + /* Parent (original) process */ if (pid == -1) { /* Capture errno for the exception. */ saved_errno = errno; } +#ifdef VFORK_USABLE + if (old_sigmask) { + /* vfork() semantics guarantees that the parent is blocked + * until the child performs _exit() or execve(), so it is safe + * to unblock signals once we're here. + * Note that in environments where vfork() is implemented as fork(), + * such as QEMU user-mode emulation, the parent won't be blocked, + * but it won't share the address space with the child, + * so it's still safe to unblock the signals. */ + pthread_sigmask(SIG_SETMASK, old_sigmask, NULL); + } +#endif + Py_XDECREF(cwd_obj2); if (need_after_fork) diff --git a/configure b/configure index 29f33b543ec..bc87485bf51 100755 --- a/configure +++ b/configure @@ -11732,7 +11732,7 @@ for ac_func in alarm accept4 setitimer getitimer bind_textdomain_codeset chown \ sigaction sigaltstack sigfillset siginterrupt sigpending sigrelse \ sigtimedwait sigwait sigwaitinfo snprintf strftime strlcpy strsignal symlinkat sync \ sysconf tcgetpgrp tcsetpgrp tempnam timegm times tmpfile tmpnam tmpnam_r \ - truncate uname unlinkat utimensat utimes waitid waitpid wait3 wait4 \ + truncate uname unlinkat utimensat utimes vfork waitid waitpid wait3 wait4 \ wcscoll wcsftime wcsxfrm wmemcmp writev _getpty rtpSpawn do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` diff --git a/configure.ac b/configure.ac index 9698c3c888a..49ed09a3a00 100644 --- a/configure.ac +++ b/configure.ac @@ -3690,7 +3690,7 @@ AC_CHECK_FUNCS(alarm accept4 setitimer getitimer bind_textdomain_codeset chown \ sigaction sigaltstack sigfillset siginterrupt sigpending sigrelse \ sigtimedwait sigwait sigwaitinfo snprintf strftime strlcpy strsignal symlinkat sync \ sysconf tcgetpgrp tcsetpgrp tempnam timegm times tmpfile tmpnam tmpnam_r \ - truncate uname unlinkat utimensat utimes waitid waitpid wait3 wait4 \ + truncate uname unlinkat utimensat utimes vfork waitid waitpid wait3 wait4 \ wcscoll wcsftime wcsxfrm wmemcmp writev _getpty rtpSpawn) # Force lchmod off for Linux. Linux disallows changing the mode of symbolic diff --git a/pyconfig.h.in b/pyconfig.h.in index 298cb4fa12f..af8a3d6d9c3 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -1301,6 +1301,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_UUID_UUID_H +/* Define to 1 if you have the `vfork' function. */ +#undef HAVE_VFORK + /* Define to 1 if you have the `wait3' function. */ #undef HAVE_WAIT3