Qemu will create vhost devices in the kernel which perform network, SCSI, etc IO and management operations from worker threads created by the kthread API. Because the kthread API does a copy_process on the kthreadd thread, the vhost layer has to use kthread_use_mm to access the Qemu thread's memory and cgroup_attach_task_all to add itself to the Qemu thread's cgroups, and it bypasses the RLIMIT_NPROC limit. This patch adds a new struct vhost_task which can be used instead of kthreads. They allow the vhost layer to use copy_process and inherit the userspace process's mm and cgroups and the task is accounted for under the userspace's nproc count. Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx> --- MAINTAINERS | 2 + drivers/vhost/Kconfig | 5 ++ include/linux/sched/vhost_task.h | 23 ++++++ kernel/Makefile | 1 + kernel/vhost_task.c | 123 +++++++++++++++++++++++++++++++ 5 files changed, 154 insertions(+) create mode 100644 include/linux/sched/vhost_task.h create mode 100644 kernel/vhost_task.c diff --git a/MAINTAINERS b/MAINTAINERS index 3e461db9cd91..4a3d9541e3cc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20477,7 +20477,9 @@ L: virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx L: netdev@xxxxxxxxxxxxxxx S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git +F: kernel/vhost_task.c F: drivers/vhost/ +F: include/linux/sched/vhost_task.h F: include/linux/vhost_iotlb.h F: include/uapi/linux/vhost.h diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 587fbae06182..b455d9ab6f3d 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -13,9 +13,14 @@ config VHOST_RING This option is selected by any driver which needs to access the host side of a virtio ring. +config VHOST_TASK + bool + default n + config VHOST tristate select VHOST_IOTLB + select VHOST_TASK help This option is selected by any driver which needs to access the core of vhost. diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h new file mode 100644 index 000000000000..50d02a25d37b --- /dev/null +++ b/include/linux/sched/vhost_task.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VHOST_TASK_H +#define _LINUX_VHOST_TASK_H + +#include <linux/completion.h> + +struct task_struct; + +struct vhost_task { + int (*fn)(void *data); + void *data; + struct completion exited; + unsigned long flags; + struct task_struct *task; +}; + +struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, int node); +__printf(2, 3) +void vhost_task_start(struct vhost_task *vtsk, const char namefmt[], ...); +void vhost_task_stop(struct vhost_task *vtsk); +bool vhost_task_should_stop(struct vhost_task *vtsk); + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index 56f4ee97f328..d82f388082b8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o +obj-$(CONFIG_VHOST_TASK) += vhost_task.o ifdef CONFIG_FUNCTION_TRACER # Do not trace internal ftrace files diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c new file mode 100644 index 000000000000..4ab1c195bc76 --- /dev/null +++ b/kernel/vhost_task.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Oracle Corporation + */ +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/sched/task.h> +#include <linux/sched/vhost_task.h> +#include <linux/sched/signal.h> + +enum vhost_task_flags { + VHOST_TASK_FLAGS_STOP, +}; + +static void vhost_task_fn(void *data) +{ + struct vhost_task *vtsk = data; + int ret; + + ret = vtsk->fn(vtsk->data); + complete(&vtsk->exited); + do_exit(ret); +} + +/** + * vhost_task_stop - stop a vhost_task + * @vtsk: vhost_task to stop + * + * Callers must call vhost_task_should_stop and return from their worker + * function when it returns true; + */ +void vhost_task_stop(struct vhost_task *vtsk) +{ + pid_t pid = vtsk->task->pid; + + set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); + wake_up_process(vtsk->task); + /* + * Make sure vhost_task_fn is no longer accessing the vhost_task before + * freeing it below. If userspace crashed or exited without closing, + * then the vhost_task->task could already be marked dead so + * kernel_wait will return early. + */ + wait_for_completion(&vtsk->exited); + /* + * If we are just closing/removing a device and the parent process is + * not exiting then reap the task. + */ + kernel_wait4(pid, NULL, __WCLONE, NULL); + kfree(vtsk); +} +EXPORT_SYMBOL_GPL(vhost_task_stop); + +/** + * vhost_task_should_stop - should the vhost task return from the work function + */ +bool vhost_task_should_stop(struct vhost_task *vtsk) +{ + return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); +} +EXPORT_SYMBOL_GPL(vhost_task_should_stop); + +/** + * vhost_task_create - create a copy of a process to be used by the kernel + * @fn: thread stack + * @arg: data to be passed to fn + * @node: numa node to allocate task from + * + * This returns a specialized task for use by the vhost layer or NULL on + * failure. The returned task is inactive, and the caller must fire it up + * through vhost_task_start(). + */ +struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, int node) +{ + struct kernel_clone_args args = { + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, + .exit_signal = 0, + .stack = (unsigned long)vhost_task_fn, + .worker_flags = USER_WORKER | USER_WORKER_NO_FILES | + USER_WORKER_SIG_IGN, + }; + struct vhost_task *vtsk; + struct task_struct *tsk; + + vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); + if (!vtsk) + return ERR_PTR(-ENOMEM); + + init_completion(&vtsk->exited); + vtsk->data = arg; + vtsk->fn = fn; + args.stack_size = (unsigned long)vtsk; + + tsk = copy_process(NULL, 0, node, &args); + if (IS_ERR(tsk)) { + kfree(vtsk); + return NULL; + } + + vtsk->task = tsk; + + return vtsk; +} +EXPORT_SYMBOL_GPL(vhost_task_create); + +/** + * vhost_task_start - start a vhost_task created with vhost_task_create + * @vtsk: vhost_task to wake up + * @namefmt: printf-style format string for the thread name + */ +void vhost_task_start(struct vhost_task *vtsk, const char namefmt[], ...) +{ + char name[TASK_COMM_LEN]; + va_list args; + + va_start(args, namefmt); + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(vtsk->task, name); + va_end(args); + + wake_up_new_task(vtsk->task); +} +EXPORT_SYMBOL_GPL(vhost_task_start); -- 2.25.1 _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization