On 3/4/21 10:32 AM, Jens Axboe wrote: > On 3/4/21 10:09 AM, Stefan Metzmacher wrote: >> >> Am 04.03.21 um 17:42 schrieb Jens Axboe: >>> On 3/4/21 9:13 AM, Stefan Metzmacher wrote: >>>> >>>> Am 04.03.21 um 14:19 schrieb Stefan Metzmacher: >>>>> Hi Jens, >>>>> >>>>>>> Can you please explain why CLONE_SIGHAND is used here? >>>>>> >>>>>> We can't have CLONE_THREAD without CLONE_SIGHAND... The io-wq workers >>>>>> don't really care about signals, we don't use them internally. >>>>> >>>>> I'm 100% sure, but I heard rumors that in some situations signals get >>>>> randomly delivered to any thread of a userspace process. >>>> >>>> Ok, from task_struct: >>>> >>>> /* Signal handlers: */ >>>> struct signal_struct *signal; >>>> struct sighand_struct __rcu *sighand; >>>> sigset_t blocked; >>>> sigset_t real_blocked; >>>> /* Restored if set_restore_sigmask() was used: */ >>>> sigset_t saved_sigmask; >>>> struct sigpending pending; >>>> >>>> The signal handlers are shared, but 'blocked' is per thread/task. >>>> >>>>> My fear was that the related logic may select a kernel thread if they >>>>> share the same signal handlers. >>>> >>>> I found the related logic in the interaction between >>>> complete_signal() and wants_signal(). >>>> >>>> static inline bool wants_signal(int sig, struct task_struct *p) >>>> { >>>> if (sigismember(&p->blocked, sig)) >>>> return false; >>>> >>>> ... >>>> >>>> Would it make sense to set up task->blocked to block all signals? >>>> >>>> Something like this: >>>> >>>> --- a/fs/io-wq.c >>>> +++ b/fs/io-wq.c >>>> @@ -611,15 +611,15 @@ pid_t io_wq_fork_thread(int (*fn)(void *), void *arg) >>>> { >>>> unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| >>>> CLONE_IO|SIGCHLD; >>>> - struct kernel_clone_args args = { >>>> - .flags = ((lower_32_bits(flags) | CLONE_VM | >>>> - CLONE_UNTRACED) & ~CSIGNAL), >>>> - .exit_signal = (lower_32_bits(flags) & CSIGNAL), >>>> - .stack = (unsigned long)fn, >>>> - .stack_size = (unsigned long)arg, >>>> - }; >>>> + sigset_t mask, oldmask; >>>> + pid_t pid; >>>> >>>> - return kernel_clone(&args); >>>> + sigfillset(&mask); >>>> + sigprocmask(SIG_BLOCK, &mask, &oldmask); >>>> + pid = kernel_thread(fn, arg, flags); >>>> + sigprocmask(SIG_SETMASK, &oldmask, NULL); >>>> + >>>> + return ret; >>>> } >>>> >>>> I think using kernel_thread() would be a good simplification anyway. >>> >>> I like this approach, we're really not interested in signals for those >>> threads, and this makes it explicit. Ditto on just using the kernel_thread() >>> helper, looks fine too. I'll run this through the testing. Do you want to >>> send this as a "real" patch, or should I just attribute you in the commit >>> message? >> >> You can do the patch, it was mostly an example. >> I'm not sure if sigprocmask() is the correct function here. >> >> Or if we better use something like this: >> >> set_restore_sigmask(); >> current->saved_sigmask = current->blocked; >> set_current_blocked(&kmask); >> pid = kernel_thread(fn, arg, flags); >> restore_saved_sigmask(); > > Might be cleaner, and allows fatal signals. How about this - it moves the signal fiddling into the task itself, and leaves the parent alone. Also allows future cleanups of how we wait for thread creation. And moves the PF_IO_WORKER into a contained spot, instead of having it throughout where we call the worker fork. Later cleanups can focus on having copy_process() do the right thing and we can kill this PF_IO_WORKER dance completely commit eeb485abb7a189058858f941fb3432bee945a861 Author: Jens Axboe <axboe@xxxxxxxxx> Date: Thu Mar 4 09:46:37 2021 -0700 io-wq: block signals by default for any io-wq worker We're not interested in signals, so let's make it explicit and block it for any worker. Wrap the thread creation in our own handler, so we can set blocked signals and serialize creation of them. A future cleanup can now simplify the waiting on thread creation from the other paths, just pushing the 'startup' completion further down. Move the PF_IO_WORKER flag dance in there as well. This will go away in the future when we teach copy_process() how to deal with this automatically. Reported-by: Stefan Metzmacher <metze@xxxxxxxxx> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx> diff --git a/fs/io-wq.c b/fs/io-wq.c index 19f18389ead2..bf5df1a31a0e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -607,19 +607,54 @@ static int task_thread_unbound(void *data) return task_thread(data, IO_WQ_ACCT_UNBOUND); } +struct thread_start_data { + struct completion startup; + int (*fn)(void *); + void *arg; +}; + +static int thread_start(void *data) +{ + struct thread_start_data *d = data; + int (*fn)(void *) = d->fn; + void *arg = d->arg; + sigset_t mask; + int ret; + + sigfillset(&mask); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&mask); + current->flags |= PF_IO_WORKER; + complete(&d->startup); + ret = fn(arg); + restore_saved_sigmask(); + return ret; +} + pid_t io_wq_fork_thread(int (*fn)(void *), void *arg) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| CLONE_IO|SIGCHLD; - struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + struct thread_start_data data = { + .startup = COMPLETION_INITIALIZER_ONSTACK(data.startup), + .fn = fn, + .arg = arg }; + bool clear = false; + pid_t pid; - return kernel_clone(&args); + /* task path doesn't have it, manager path does */ + if (!(current->flags & PF_IO_WORKER)) { + current->flags |= PF_IO_WORKER; + clear = true; + } + pid = kernel_thread(thread_start, &data, flags); + if (clear) + current->flags &= ~PF_IO_WORKER; + if (pid >= 0) + wait_for_completion(&data.startup); + return pid; } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) @@ -752,7 +787,6 @@ static int io_wq_manager(void *data) sprintf(buf, "iou-mgr-%d", wq->task_pid); set_task_comm(current, buf); - current->flags |= PF_IO_WORKER; wq->manager = get_task_struct(current); complete(&wq->started); @@ -821,9 +855,7 @@ static int io_wq_fork_manager(struct io_wq *wq) return 0; reinit_completion(&wq->worker_done); - current->flags |= PF_IO_WORKER; ret = io_wq_fork_thread(io_wq_manager, wq); - current->flags &= ~PF_IO_WORKER; if (ret >= 0) { wait_for_completion(&wq->started); return 0; diff --git a/fs/io_uring.c b/fs/io_uring.c index e55369555e5c..995f506e3a60 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7824,9 +7824,7 @@ static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx) reinit_completion(&sqd->completion); ctx->sqo_exec = 0; sqd->task_pid = current->pid; - current->flags |= PF_IO_WORKER; ret = io_wq_fork_thread(io_sq_thread, sqd); - current->flags &= ~PF_IO_WORKER; if (ret < 0) { sqd->thread = NULL; return ret; @@ -7896,9 +7894,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->task_pid = current->pid; - current->flags |= PF_IO_WORKER; ret = io_wq_fork_thread(io_sq_thread, sqd); - current->flags &= ~PF_IO_WORKER; if (ret < 0) { sqd->thread = NULL; goto err; -- Jens Axboe