On Thu, Jun 01, 2023 at 01:32:32PM -0500, Mike Christie wrote: > When switching from kthreads to vhost_tasks two bugs were added: > 1. The vhost worker tasks's now show up as processes so scripts doing > ps or ps a would not incorrectly detect the vhost task as another > process. 2. kthreads disabled freeze by setting PF_NOFREEZE, but > vhost tasks's didn't disable or add support for them. > > To fix both bugs, this switches the vhost task to be thread in the > process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call > get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that > SIGKILL/STOP support is required because CLONE_THREAD requires > CLONE_SIGHAND which requires those 2 signals to be supported. > > This is a modified version of the patch written by Mike Christie > <michael.christie@xxxxxxxxxx> which was a modified version of patch > originally written by Linus. > > Much of what depended upon PF_IO_WORKER now depends on PF_USER_WORKER. > Including ignoring signals, setting up the register state, and having > get_signal return instead of calling do_group_exit. > > Tidied up the vhost_task abstraction so that the definition of > vhost_task only needs to be visible inside of vhost_task.c. Making > it easier to review the code and tell what needs to be done where. > As part of this the main loop has been moved from vhost_worker into > vhost_task_fn. vhost_worker now returns true if work was done. > > The main loop has been updated to call get_signal which handles > SIGSTOP, freezing, and collects the message that tells the thread to > exit as part of process exit. This collection clears > __fatal_signal_pending. This collection is not guaranteed to > clear signal_pending() so clear that explicitly so the schedule() > sleeps. > > For now the vhost thread continues to exist and run work until the > last file descriptor is closed and the release function is called as > part of freeing struct file. To avoid hangs in the coredump > rendezvous and when killing threads in a multi-threaded exec. The > coredump code and de_thread have been modified to ignore vhost threads. > > Remvoing the special case for exec appears to require teaching > vhost_dev_flush how to directly complete transactions in case > the vhost thread is no longer running. > > Removing the special case for coredump rendezvous requires either the > above fix needed for exec or moving the coredump rendezvous into > get_signal. > > Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads") > Signed-off-by: Eric W. Biederman <ebiederm@xxxxxxxxxxxx> > Co-developed-by: Mike Christie <michael.christie@xxxxxxxxxx> > Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx> Acked-by: Michael S. Tsirkin <mst@xxxxxxxxxx> > --- > arch/x86/include/asm/fpu/sched.h | 2 +- > arch/x86/kernel/fpu/context.h | 2 +- > arch/x86/kernel/fpu/core.c | 2 +- > drivers/vhost/vhost.c | 22 ++------ > fs/coredump.c | 4 +- > include/linux/sched/task.h | 1 - > include/linux/sched/vhost_task.h | 15 ++---- > kernel/exit.c | 5 +- > kernel/fork.c | 13 ++--- > kernel/signal.c | 8 +-- > kernel/vhost_task.c | 92 +++++++++++++++++++++----------- > 11 files changed, 89 insertions(+), 77 deletions(-) > > diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h > index c2d6cd78ed0c..78fcde7b1f07 100644 > --- a/arch/x86/include/asm/fpu/sched.h > +++ b/arch/x86/include/asm/fpu/sched.h > @@ -39,7 +39,7 @@ extern void fpu_flush_thread(void); > static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) > { > if (cpu_feature_enabled(X86_FEATURE_FPU) && > - !(current->flags & (PF_KTHREAD | PF_IO_WORKER))) { > + !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { > save_fpregs_to_fpstate(old_fpu); > /* > * The save operation preserved register state, so the > diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h > index 9fcfa5c4dad7..af5cbdd9bd29 100644 > --- a/arch/x86/kernel/fpu/context.h > +++ b/arch/x86/kernel/fpu/context.h > @@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void) > struct fpu *fpu = ¤t->thread.fpu; > int cpu = smp_processor_id(); > > - if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER))) > + if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) > return; > > if (!fpregs_state_valid(fpu, cpu)) { > diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c > index caf33486dc5e..1015af1ae562 100644 > --- a/arch/x86/kernel/fpu/core.c > +++ b/arch/x86/kernel/fpu/core.c > @@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) > > this_cpu_write(in_kernel_fpu, true); > > - if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) && > + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && > !test_thread_flag(TIF_NEED_FPU_LOAD)) { > set_thread_flag(TIF_NEED_FPU_LOAD); > save_fpregs_to_fpstate(¤t->thread.fpu); > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index a92af08e7864..074273020849 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -256,7 +256,7 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) > * test_and_set_bit() implies a memory barrier. > */ > llist_add(&work->node, &dev->worker->work_list); > - wake_up_process(dev->worker->vtsk->task); > + vhost_task_wake(dev->worker->vtsk); > } > } > EXPORT_SYMBOL_GPL(vhost_work_queue); > @@ -333,31 +333,19 @@ static void vhost_vq_reset(struct vhost_dev *dev, > __vhost_vq_meta_reset(vq); > } > > -static int vhost_worker(void *data) > +static bool vhost_worker(void *data) > { > struct vhost_worker *worker = data; > struct vhost_work *work, *work_next; > struct llist_node *node; > > - for (;;) { > - /* mb paired w/ kthread_stop */ > - set_current_state(TASK_INTERRUPTIBLE); > - > - if (vhost_task_should_stop(worker->vtsk)) { > - __set_current_state(TASK_RUNNING); > - break; > - } > - > - node = llist_del_all(&worker->work_list); > - if (!node) > - schedule(); > - > + node = llist_del_all(&worker->work_list); > + if (node) { > node = llist_reverse_order(node); > /* make sure flag is seen after deletion */ > smp_wmb(); > llist_for_each_entry_safe(work, work_next, node, node) { > clear_bit(VHOST_WORK_QUEUED, &work->flags); > - __set_current_state(TASK_RUNNING); > kcov_remote_start_common(worker->kcov_handle); > work->fn(work); > kcov_remote_stop(); > @@ -365,7 +353,7 @@ static int vhost_worker(void *data) > } > } > > - return 0; > + return !!node; > } > > static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) > diff --git a/fs/coredump.c b/fs/coredump.c > index ece7badf701b..88740c51b942 100644 > --- a/fs/coredump.c > +++ b/fs/coredump.c > @@ -371,7 +371,9 @@ static int zap_process(struct task_struct *start, int exit_code) > if (t != current && !(t->flags & PF_POSTCOREDUMP)) { > sigaddset(&t->pending.signal, SIGKILL); > signal_wake_up(t, 1); > - nr++; > + /* The vhost_worker does not particpate in coredumps */ > + if ((t->flags & (PF_USER_WORKER | PF_IO_WORKER)) != PF_USER_WORKER) > + nr++; > } > } > > diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h > index 537cbf9a2ade..e0f5ac90a228 100644 > --- a/include/linux/sched/task.h > +++ b/include/linux/sched/task.h > @@ -29,7 +29,6 @@ struct kernel_clone_args { > u32 io_thread:1; > u32 user_worker:1; > u32 no_files:1; > - u32 ignore_signals:1; > unsigned long stack; > unsigned long stack_size; > unsigned long tls; > diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h > index 6123c10b99cf..837a23624a66 100644 > --- a/include/linux/sched/vhost_task.h > +++ b/include/linux/sched/vhost_task.h > @@ -2,22 +2,13 @@ > #ifndef _LINUX_VHOST_TASK_H > #define _LINUX_VHOST_TASK_H > > -#include <linux/completion.h> > > -struct task_struct; > +struct vhost_task; > > -struct vhost_task { > - int (*fn)(void *data); > - void *data; > - struct completion exited; > - unsigned long flags; > - struct task_struct *task; > -}; > - > -struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, > +struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, > const char *name); > void vhost_task_start(struct vhost_task *vtsk); > void vhost_task_stop(struct vhost_task *vtsk); > -bool vhost_task_should_stop(struct vhost_task *vtsk); > +void vhost_task_wake(struct vhost_task *vtsk); > > #endif > diff --git a/kernel/exit.c b/kernel/exit.c > index 34b90e2e7cf7..edb50b4c9972 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -411,7 +411,10 @@ static void coredump_task_exit(struct task_struct *tsk) > tsk->flags |= PF_POSTCOREDUMP; > core_state = tsk->signal->core_state; > spin_unlock_irq(&tsk->sighand->siglock); > - if (core_state) { > + > + /* The vhost_worker does not particpate in coredumps */ > + if (core_state && > + ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) { > struct core_thread self; > > self.task = current; > diff --git a/kernel/fork.c b/kernel/fork.c > index ed4e01daccaa..81cba91f30bb 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -2336,16 +2336,16 @@ __latent_entropy struct task_struct *copy_process( > p->flags &= ~PF_KTHREAD; > if (args->kthread) > p->flags |= PF_KTHREAD; > - if (args->user_worker) > - p->flags |= PF_USER_WORKER; > - if (args->io_thread) { > + if (args->user_worker) { > /* > - * Mark us an IO worker, and block any signal that isn't > + * Mark us a user worker, and block any signal that isn't > * fatal or STOP > */ > - p->flags |= PF_IO_WORKER; > + p->flags |= PF_USER_WORKER; > siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); > } > + if (args->io_thread) > + p->flags |= PF_IO_WORKER; > > if (args->name) > strscpy_pad(p->comm, args->name, sizeof(p->comm)); > @@ -2517,9 +2517,6 @@ __latent_entropy struct task_struct *copy_process( > if (retval) > goto bad_fork_cleanup_io; > > - if (args->ignore_signals) > - ignore_signals(p); > - > stackleak_task_init(p); > > if (pid != &init_struct_pid) { > diff --git a/kernel/signal.c b/kernel/signal.c > index 8f6330f0e9ca..2547fa73bde5 100644 > --- a/kernel/signal.c > +++ b/kernel/signal.c > @@ -1368,7 +1368,9 @@ int zap_other_threads(struct task_struct *p) > > while_each_thread(p, t) { > task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); > - count++; > + /* Don't require de_thread to wait for the vhost_worker */ > + if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) > + count++; > > /* Don't bother with already dead threads */ > if (t->exit_state) > @@ -2861,11 +2863,11 @@ bool get_signal(struct ksignal *ksig) > } > > /* > - * PF_IO_WORKER threads will catch and exit on fatal signals > + * PF_USER_WORKER threads will catch and exit on fatal signals > * themselves. They have cleanup that must be performed, so > * we cannot call do_exit() on their behalf. > */ > - if (current->flags & PF_IO_WORKER) > + if (current->flags & PF_USER_WORKER) > goto out; > > /* > diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c > index b7cbd66f889e..f80d5c51ae67 100644 > --- a/kernel/vhost_task.c > +++ b/kernel/vhost_task.c > @@ -12,58 +12,88 @@ enum vhost_task_flags { > VHOST_TASK_FLAGS_STOP, > }; > > +struct vhost_task { > + bool (*fn)(void *data); > + void *data; > + struct completion exited; > + unsigned long flags; > + struct task_struct *task; > +}; > + > static int vhost_task_fn(void *data) > { > struct vhost_task *vtsk = data; > - int ret; > + bool dead = false; > + > + for (;;) { > + bool did_work; > + > + /* mb paired w/ vhost_task_stop */ > + if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) > + break; > + > + if (!dead && signal_pending(current)) { > + struct ksignal ksig; > + /* > + * Calling get_signal will block in SIGSTOP, > + * or clear fatal_signal_pending, but remember > + * what was set. > + * > + * This thread won't actually exit until all > + * of the file descriptors are closed, and > + * the release function is called. > + */ > + dead = get_signal(&ksig); > + if (dead) > + clear_thread_flag(TIF_SIGPENDING); > + } > + > + did_work = vtsk->fn(vtsk->data); > + if (!did_work) { > + set_current_state(TASK_INTERRUPTIBLE); > + schedule(); > + } > + } > > - ret = vtsk->fn(vtsk->data); > complete(&vtsk->exited); > - do_exit(ret); > + do_exit(0); > +} > + > +/** > + * vhost_task_wake - wakeup the vhost_task > + * @vtsk: vhost_task to wake > + * > + * wake up the vhost_task worker thread > + */ > +void vhost_task_wake(struct vhost_task *vtsk) > +{ > + wake_up_process(vtsk->task); > } > +EXPORT_SYMBOL_GPL(vhost_task_wake); > > /** > * vhost_task_stop - stop a vhost_task > * @vtsk: vhost_task to stop > * > - * Callers must call vhost_task_should_stop and return from their worker > - * function when it returns true; > + * vhost_task_fn ensures the worker thread exits after > + * VHOST_TASK_FLAGS_SOP becomes true. > */ > void vhost_task_stop(struct vhost_task *vtsk) > { > - pid_t pid = vtsk->task->pid; > - > set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); > - wake_up_process(vtsk->task); > + vhost_task_wake(vtsk); > /* > * Make sure vhost_task_fn is no longer accessing the vhost_task before > - * freeing it below. If userspace crashed or exited without closing, > - * then the vhost_task->task could already be marked dead so > - * kernel_wait will return early. > + * freeing it below. > */ > wait_for_completion(&vtsk->exited); > - /* > - * If we are just closing/removing a device and the parent process is > - * not exiting then reap the task. > - */ > - kernel_wait4(pid, NULL, __WCLONE, NULL); > kfree(vtsk); > } > EXPORT_SYMBOL_GPL(vhost_task_stop); > > /** > - * vhost_task_should_stop - should the vhost task return from the work function > - * @vtsk: vhost_task to stop > - */ > -bool vhost_task_should_stop(struct vhost_task *vtsk) > -{ > - return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); > -} > -EXPORT_SYMBOL_GPL(vhost_task_should_stop); > - > -/** > - * vhost_task_create - create a copy of a process to be used by the kernel > - * @fn: thread stack > + * vhost_task_create - create a copy of a task to be used by the kernel > + * @fn: vhost worker function > * @arg: data to be passed to fn > * @name: the thread's name > * > @@ -71,17 +101,17 @@ EXPORT_SYMBOL_GPL(vhost_task_should_stop); > * failure. The returned task is inactive, and the caller must fire it up > * through vhost_task_start(). > */ > -struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, > +struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, > const char *name) > { > struct kernel_clone_args args = { > - .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, > + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM | > + CLONE_THREAD | CLONE_SIGHAND, > .exit_signal = 0, > .fn = vhost_task_fn, > .name = name, > .user_worker = 1, > .no_files = 1, > - .ignore_signals = 1, > }; > struct vhost_task *vtsk; > struct task_struct *tsk; > -- > 2.25.1 _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization