On 5/15/23 5:54 PM, Linus Torvalds wrote: > On Mon, May 15, 2023 at 3:23 PM Mike Christie > <michael.christie@xxxxxxxxxx> wrote: >> >> The vhost layer really doesn't want any signals and wants to work like kthreads >> for that case. To make it really simple can we do something like this where it >> separates user and io worker behavior where the major diff is how they handle >> signals and exit. I also included a fix for the freeze case: > > I don't love the SIGKILL special case, but I also don't find this > deeply offensive. So if this is what it takes, I'm ok with it. > > I wonder if we could make that special case simply check for "is > SIGKILL blocked" instead? No normal case will cause that, and it means Yeah, it's doable. Updated below. > that a PF_USER_WORKER thread could decide per-thread what it wants to > do wrt SIGKILL. > > Christian? And I guess we should Cc: Oleg too, since the signal parts > are an area he's familiar with and has worked on.. Eric Biederman has > already been on the list and has also been involved > > Oleg: see > > https://lore.kernel.org/lkml/122b597e-a5fa-daf7-27bb-6f04fa98d496@xxxxxxxxxx/ > > for the context here. Oleg and Christian, Below is an updated patch that doesn't check for PF_USER_WORKER in the signal.c code and instead will check for if we have blocked the signal. diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 537cbf9a2ade..e0f5ac90a228 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -29,7 +29,6 @@ struct kernel_clone_args { u32 io_thread:1; u32 user_worker:1; u32 no_files:1; - u32 ignore_signals:1; unsigned long stack; unsigned long stack_size; unsigned long tls; diff --git a/kernel/fork.c b/kernel/fork.c index ed4e01daccaa..fd2970b598b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2336,8 +2336,15 @@ __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; - if (args->user_worker) + if (args->user_worker) { + /* + * User worker are similar to io_threads but they do not + * support signals and cleanup is driven via another kernel + * interface so even SIGKILL is blocked. + */ p->flags |= PF_USER_WORKER; + siginitsetinv(&p->blocked, 0); + } if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@ -2517,8 +2524,8 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; - if (args->ignore_signals) - ignore_signals(p); + if (args->user_worker) + p->flags |= PF_NOFREEZE; stackleak_task_init(p); @@ -2860,7 +2867,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .fn = fn, .fn_arg = arg, .io_thread = 1, - .user_worker = 1, }; return copy_process(NULL, 0, node, &args); diff --git a/kernel/signal.c b/kernel/signal.c index 8f6330f0e9ca..bc7e26072437 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -995,6 +995,19 @@ static inline bool wants_signal(int sig, struct task_struct *p) return task_curr(p) || !task_sigpending(p); } +static void try_set_pending_sigkill(struct task_struct *t) +{ + /* + * User workers don't support signals and their exit is driven through + * their kernel layer, so by default block even SIGKILL. + */ + if (sigismember(&t->blocked, SIGKILL)) + return; + + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); +} + static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; @@ -1055,8 +1068,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) t = p; do { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - sigaddset(&t->pending.signal, SIGKILL); - signal_wake_up(t, 1); + try_set_pending_sigkill(t); } while_each_thread(p, t); return; } @@ -1373,8 +1385,7 @@ int zap_other_threads(struct task_struct *p) /* Don't bother with already dead threads */ if (t->exit_state) continue; - sigaddset(&t->pending.signal, SIGKILL); - signal_wake_up(t, 1); + try_set_pending_sigkill(t); } return count; diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index b7cbd66f889e..2d8d3ebaec4d 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -75,13 +75,13 @@ struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, const char *name) { struct kernel_clone_args args = { - .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM | + CLONE_THREAD | CLONE_SIGHAND, .exit_signal = 0, .fn = vhost_task_fn, .name = name, .user_worker = 1, .no_files = 1, - .ignore_signals = 1, }; struct vhost_task *vtsk; struct task_struct *tsk; diff --git a/mm/vmscan.c b/mm/vmscan.c index d257916f39e5..255a2147e5c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1207,12 +1207,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) DEFINE_WAIT(wait); /* - * Do not throttle user workers, kthreads other than kswapd or + * Do not throttle IO/user workers, kthreads other than kswapd or * workqueues. They may be required for reclaim to make * forward progress (e.g. journalling workqueues or kthreads). */ if (!current_is_kswapd() && - current->flags & (PF_USER_WORKER|PF_KTHREAD)) { + current->flags & (PF_USER_WORKER|PF_IO_WORKER|PF_KTHREAD)) { cond_resched(); return; } _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization