Hi Peter, On Fri, Sep 17, 2021 at 11:03:21AM -0700, Peter Oskolkov wrote: > Define struct umcg_task and two syscalls: sys_umcg_ctl sys_umcg_wait. > > All key operations, such as wait/wake/context-switch, as well as > timeouts and block/wake detection, are working quite reliably. > > In addition, the userspace can now force the kernel to preempt > a running worker by changing its state from RUNNING to > RUNNING | PREEMPTED and sending a signal to it. This new functionality > is less well tested than the key operations above, but is working > well in the common case of the worker busy in the userspace. > > These big things remain to be addressed (in no particular order): > - tracing/debugging > - faster context switches (see umcg_do_context_switch in umcg.c) > - other architectures (we will need at least arm64 in addition to amd64) > - tools/lib/umcg for userspace > - kselftests > > I'm working on finalizing libumcg and kselftests. > > See Documentation/userspace-api/umcg.[txt|rst] for API usage and > other details. > > v0.5->v0.6 changes: > - umcg_task pages are now pinned for RUNNING workers; > - waking workers now wait for the userspace to schedule them > in exit_to_user_mode_loop() instead of in sched_update_worker(); > - added umcg_clear_child to fork and execve; > - changed current->umcg_task assignments to WRITE_ONCE; > - server/worker interactions are restricted to tasks in the same mm; > > v0.4->v0.5 changes: > - handling idle workers and servers is now much simpler on the kernel > side, thanks to Thierry Delisle's suggestion: > https://lore.kernel.org/lkml/3530714d-125b-e0f5-45b2-72695e2fc4ee@xxxxxxxxxxxx/ > - minor tweaks to improve preemption handling; > > v0.3->v0.4 changes: > - removed server_tid and api_version fields from struct umcg_task; > - added timeout handling to sys_umcg_wait(); > - implemented worker preemption via signals; > - handling idle workers and servers is changed again (see umcg.rst). > > v0.2->v0.3 changes: > - the overall approach is now based on peterz@'s suggestion in > https://lore.kernel.org/patchwork/cover/1433967/ > (should I add Suggested-by?) > - new protocol for working with idle workers and servers is used, to avoid > spinning in the kernel; > - waking a UMCG task now does not require spinning. > > Signed-off-by: Peter Oskolkov <posk@xxxxxxxxxx> > --- > arch/x86/entry/syscalls/syscall_64.tbl | 2 + > fs/exec.c | 1 + > include/linux/sched.h | 56 ++ > include/linux/syscalls.h | 4 + > include/uapi/asm-generic/unistd.h | 8 +- > include/uapi/linux/umcg.h | 117 ++++ > init/Kconfig | 10 + > kernel/entry/common.c | 1 + > kernel/exit.c | 2 + > kernel/sched/Makefile | 1 + > kernel/sched/core.c | 15 +- > kernel/sched/umcg.c | 745 +++++++++++++++++++++++++ > kernel/sys_ni.c | 4 + > 13 files changed, 963 insertions(+), 3 deletions(-) > create mode 100644 include/uapi/linux/umcg.h > create mode 100644 kernel/sched/umcg.c > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index ce18119ea0d0..0c6c7fd72b0b 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -368,6 +368,8 @@ > 444 common landlock_create_ruleset sys_landlock_create_ruleset > 445 common landlock_add_rule sys_landlock_add_rule > 446 common landlock_restrict_self sys_landlock_restrict_self > +447 common umcg_ctl sys_umcg_ctl > +448 common umcg_wait sys_umcg_wait > > # > # Due to a historical design error, certain syscalls are numbered differently > diff --git a/fs/exec.c b/fs/exec.c > index 18594f11c31f..d652ef8017b2 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -1835,6 +1835,7 @@ static int bprm_execve(struct linux_binprm *bprm, > current->fs->in_exec = 0; > current->in_execve = 0; > rseq_execve(current); > + umcg_execve(current); > acct_update_integrals(current); > task_numa_free(current, false); > return retval; > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 549018e46801..4cf9070d1361 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -66,6 +66,7 @@ struct sighand_struct; > struct signal_struct; > struct task_delay_info; > struct task_group; > +struct umcg_task; > > /* > * Task state bitmask. NOTE! These bits are also > @@ -1230,6 +1231,12 @@ struct task_struct { > unsigned long rseq_event_mask; > #endif > > +#ifdef CONFIG_UMCG > + struct umcg_task __user *umcg_task; > + struct page *pinned_umcg_worker_page; /* self */ > + struct page *pinned_umcg_server_page; > +#endif > + > struct tlbflush_unmap_batch tlb_ubc; > > union { > @@ -1606,6 +1613,7 @@ extern struct pid *cad_pid; > #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ > #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ > #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ > +#define PF_UMCG_WORKER 0x01000000 /* UMCG worker */ > #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ > #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ > #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ > @@ -2191,6 +2199,54 @@ static inline void rseq_execve(struct task_struct *t) > > #endif > > +#ifdef CONFIG_UMCG > + > +void umcg_handle_resuming_worker(void); > +void umcg_handle_exiting_worker(void); > +void umcg_clear_child(struct task_struct *tsk); > + > +/* Called by bprm_execve() in fs/exec.c. */ > +static inline void umcg_execve(struct task_struct *tsk) > +{ > + if (tsk->umcg_task) > + umcg_clear_child(tsk); > +} > + > +/* Called by exit_to_user_mode_loop() in kernel/entry/common.c.*/ > +static inline void umcg_handle_notify_resume(void) > +{ > + if (current->flags & PF_UMCG_WORKER) > + umcg_handle_resuming_worker(); > +} > + > +/* Called by do_exit() in kernel/exit.c. */ > +static inline void umcg_handle_exit(void) > +{ > + if (current->flags & PF_UMCG_WORKER) > + umcg_handle_exiting_worker(); > +} > + > +/* > + * umcg_wq_worker_[sleeping|running] are called in core.c by > + * sched_submit_work() and sched_update_worker(). > + */ > +void umcg_wq_worker_sleeping(struct task_struct *tsk); > +void umcg_wq_worker_running(struct task_struct *tsk); > + > +#else > + > +static inline void umcg_execve(struct task_struct *tsk) > +{ > +} > +static inline void umcg_handle_notify_resume(void) > +{ > +} > +static inline void umcg_handle_exit(void) > +{ > +} > + > +#endif > + > #ifdef CONFIG_DEBUG_RSEQ > > void rseq_syscall(struct pt_regs *regs); > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 050511e8f1f8..f3e1ef8d842f 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -71,6 +71,7 @@ struct open_how; > struct mount_attr; > struct landlock_ruleset_attr; > enum landlock_rule_type; > +struct umcg_task; > > #include <linux/types.h> > #include <linux/aio_abi.h> > @@ -1050,6 +1051,9 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _ > asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type, > const void __user *rule_attr, __u32 flags); > asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); > +asmlinkage long sys_umcg_ctl(u32 flags, struct umcg_task __user *self); > +asmlinkage long sys_umcg_wait(u32 flags, u64 abs_timeout); > + > > /* > * Architecture-specific system calls > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > index 6de5a7fc066b..1a4c9ac0e296 100644 > --- a/include/uapi/asm-generic/unistd.h > +++ b/include/uapi/asm-generic/unistd.h > @@ -873,8 +873,14 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) > #define __NR_landlock_restrict_self 446 > __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) > > +#define __NR_umcg_ctl 447 > +__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl) > +#define __NR_umcg_wait 448 > +__SYSCALL(__NR_umcg_wait, sys_umcg_wait) > + > + > #undef __NR_syscalls > -#define __NR_syscalls 447 > +#define __NR_syscalls 449 > > /* > * 32 bit systems traditionally used different > diff --git a/include/uapi/linux/umcg.h b/include/uapi/linux/umcg.h > new file mode 100644 > index 000000000000..edce804781f9 > --- /dev/null > +++ b/include/uapi/linux/umcg.h > @@ -0,0 +1,117 @@ > +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ > +#ifndef _UAPI_LINUX_UMCG_H > +#define _UAPI_LINUX_UMCG_H > + > +#include <linux/limits.h> > +#include <linux/types.h> > + > +/* > + * UMCG: User Managed Concurrency Groups. > + * > + * Syscalls (see kernel/sched/umcg.c): > + * sys_umcg_ctl() - register/unregister UMCG tasks; > + * sys_umcg_wait() - wait/wake/context-switch. > + * > + * struct umcg_task (below): controls the state of UMCG tasks. > + * > + * See Documentation/userspace-api/umcg.[txt|rst] for detals. > + */ > + > +/* > + * UMCG task states, the first 8 bits. The states represent the user space > + * point of view. > + */ > +#define UMCG_TASK_NONE 0 > +#define UMCG_TASK_RUNNING 1 > +#define UMCG_TASK_IDLE 2 > +#define UMCG_TASK_BLOCKED 3 > + > +/* The first byte: RUNNING, IDLE, or BLOCKED. */ > +#define UMCG_TASK_STATE_MASK 0xff > + > +/* UMCG task state flags, bits 8-15 */ > + > +/* > + * UMCG_TF_LOCKED: locked by the userspace in preparation to calling umcg_wait. > + */ > +#define UMCG_TF_LOCKED (1 << 8) > + > +/* > + * UMCG_TF_PREEMPTED: the userspace indicates the worker should be preempted. > + */ > +#define UMCG_TF_PREEMPTED (1 << 9) > + > +/** > + * struct umcg_task - controls the state of UMCG tasks. > + * > + * The struct is aligned at 64 bytes to ensure that it fits into > + * a single cache line. > + */ > +struct umcg_task { > + /** > + * @state: the current state of the UMCG task described by this struct. > + * > + * Readable/writable by both the kernel and the userspace. > + * > + * UMCG task state: > + * bits 0 - 7: task state; > + * bits 8 - 15: state flags; > + * bits 16 - 23: reserved; must be zeroes; > + * bits 24 - 31: for userspace use. > + */ > + uint32_t state; /* r/w */ > + > + /** > + * @next_tid: the TID of the UMCG task that should be context-switched > + * into in sys_umcg_wait(). Can be zero. > + * > + * Running UMCG workers must have next_tid set to point to IDLE > + * UMCG servers. > + * > + * Read-only for the kernel, read/write for the userspace. > + */ > + uint32_t next_tid; /* r */ > + > + /** > + * @idle_workers_ptr: a single-linked list of idle workers. Can be NULL. > + * > + * Readable/writable by both the kernel and the userspace: the > + * kernel adds items to the list, the userspace removes them. > + */ > + uint64_t idle_workers_ptr; /* r/w */ > + > + /** > + * @idle_server_tid_ptr: a pointer pointing to a single idle server. > + * Readonly. > + */ > + uint64_t idle_server_tid_ptr; /* r */ > +} __attribute__((packed, aligned(8 * sizeof(__u64)))); > + > +/** > + * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl > + * @UMCG_CTL_REGISTER: register the current task as a UMCG task > + * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task > + * @UMCG_CTL_WORKER: register the current task as a UMCG worker > + */ > +enum umcg_ctl_flag { > + UMCG_CTL_REGISTER = 0x00001, > + UMCG_CTL_UNREGISTER = 0x00002, > + UMCG_CTL_WORKER = 0x10000, > +}; > + > +/** > + * enum umcg_wait_flag - flags to pass to sys_umcg_wait > + * @UMCG_WAIT_WAKE_ONLY: wake @self->next_tid, don't put @self to sleep; > + * @UMCG_WAIT_WF_CURRENT_CPU: wake @self->next_tid on the current CPU > + * (use WF_CURRENT_CPU); @UMCG_WAIT_WAKE_ONLY > + * must be set. > + */ > +enum umcg_wait_flag { > + UMCG_WAIT_WAKE_ONLY = 1, > + UMCG_WAIT_WF_CURRENT_CPU = 2, > +}; > + > +/* See Documentation/userspace-api/umcg.[txt|rst].*/ > +#define UMCG_IDLE_NODE_PENDING (1ULL) > + > +#endif /* _UAPI_LINUX_UMCG_H */ > diff --git a/init/Kconfig b/init/Kconfig > index a61c92066c2e..c15a50a61ba6 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -1662,6 +1662,16 @@ config MEMBARRIER > > If unsure, say Y. > > +config UMCG > + bool "Enable User Managed Concurrency Groups API" > + depends on X86_64 > + default n > + help > + Enable User Managed Concurrency Groups API, which form the basis > + for an in-process M:N userspace scheduling framework. > + At the moment this is an experimental/RFC feature that is not > + guaranteed to be backward-compatible. > + > config KALLSYMS > bool "Load all symbols for debugging/ksymoops" if EXPERT > default y > diff --git a/kernel/entry/common.c b/kernel/entry/common.c > index bf16395b9e13..f3cd335ab513 100644 > --- a/kernel/entry/common.c > +++ b/kernel/entry/common.c > @@ -173,6 +173,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, > > if (ti_work & _TIF_NOTIFY_RESUME) { > tracehook_notify_resume(regs); > + umcg_handle_notify_resume(); /* might sleep */ > rseq_handle_notify_resume(NULL, regs); > } > > diff --git a/kernel/exit.c b/kernel/exit.c > index fd1c04193e18..fdd4e923cca9 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -744,6 +744,8 @@ void __noreturn do_exit(long code) > if (unlikely(!tsk->pid)) > panic("Attempted to kill the idle task!"); > > + umcg_handle_exit(); > + > /* > * If do_exit is called because this processes oopsed, it's possible > * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before > diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile > index 978fcfca5871..e4e481eee1b7 100644 > --- a/kernel/sched/Makefile > +++ b/kernel/sched/Makefile > @@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o > obj-$(CONFIG_CPU_ISOLATION) += isolation.o > obj-$(CONFIG_PSI) += psi.o > obj-$(CONFIG_SCHED_CORE) += core_sched.o > +obj-$(CONFIG_UMCG) += umcg.o > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 12a9d053e724..c9133cf153b9 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -4159,6 +4159,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) > p->wake_entry.u_flags = CSD_TYPE_TTWU; > p->migration_pending = NULL; > #endif > +#ifdef CONFIG_UMCG > + umcg_clear_child(p); > +#endif > } > > DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); > @@ -6105,10 +6108,14 @@ static inline void sched_submit_work(struct task_struct *tsk) > * in the possible wakeup of a kworker and because wq_worker_sleeping() > * requires it. > */ > - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { > + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) { > preempt_disable(); > if (task_flags & PF_WQ_WORKER) > wq_worker_sleeping(tsk); > +#ifdef CONFIG_UMCG > + else if (task_flags & PF_UMCG_WORKER) > + umcg_wq_worker_sleeping(tsk); > +#endif > else > io_wq_worker_sleeping(tsk); > preempt_enable_no_resched(); > @@ -6127,9 +6134,13 @@ static inline void sched_submit_work(struct task_struct *tsk) > > static void sched_update_worker(struct task_struct *tsk) > { > - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { > + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) { > if (tsk->flags & PF_WQ_WORKER) > wq_worker_running(tsk); > +#ifdef CONFIG_UMCG > + else if (tsk->flags & PF_UMCG_WORKER) > + umcg_wq_worker_running(tsk); > +#endif > else > io_wq_worker_running(tsk); > } > diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c > new file mode 100644 > index 000000000000..aa4dbb31c425 > --- /dev/null > +++ b/kernel/sched/umcg.c > @@ -0,0 +1,745 @@ > +// SPDX-License-Identifier: GPL-2.0-only > + > +/* > + * User Managed Concurrency Groups (UMCG). > + * > + * See Documentation/userspace-api/umcg.[txt|rst] for detals. > + */ > + > +#include <linux/syscalls.h> > +#include <linux/types.h> > +#include <linux/uaccess.h> > +#include <linux/umcg.h> > + > +#include "sched.h" > +#include "umcg_uaccess.h" > + > +/** > + * umcg_pin_pages: pin pages containing struct umcg_task of this worker > + * and its server. > + * > + * The pages are pinned when the worker exits to the userspace and unpinned > + * when the worker is in sched_submit_work(), i.e. when the worker is > + * about to be removed from its runqueue. Thus at most NR_CPUS UMCG pages > + * are pinned at any one time across the whole system. > + */ > +static int umcg_pin_pages(u32 server_tid) > +{ > + struct umcg_task __user *worker_ut = current->umcg_task; > + struct umcg_task __user *server_ut = NULL; > + struct task_struct *tsk; > + > + rcu_read_lock(); > + tsk = find_task_by_vpid(server_tid); > + if (tsk) > + server_ut = READ_ONCE(tsk->umcg_task); > + rcu_read_unlock(); > + > + if (!server_ut) > + return -EINVAL; > + > + if (READ_ONCE(current->mm) != READ_ONCE(tsk->mm)) > + return -EINVAL; > + > + tsk = current; > + > + /* worker_ut is stable, don't need to repin */ > + if (!tsk->pinned_umcg_worker_page) > + if (1 != pin_user_pages_fast((unsigned long)worker_ut, 1, 0, > + &tsk->pinned_umcg_worker_page)) > + return -EFAULT; > + > + /* server_ut may change, need to repin */ > + if (tsk->pinned_umcg_server_page) { > + unpin_user_page(tsk->pinned_umcg_server_page); > + tsk->pinned_umcg_server_page = NULL; > + } > + > + if (1 != pin_user_pages_fast((unsigned long)server_ut, 1, 0, > + &tsk->pinned_umcg_server_page)) > + return -EFAULT; > + > + return 0; > +} > + > +static void umcg_unpin_pages(void) > +{ > + struct task_struct *tsk = current; > + > + if (tsk->pinned_umcg_worker_page) > + unpin_user_page(tsk->pinned_umcg_worker_page); > + if (tsk->pinned_umcg_server_page) > + unpin_user_page(tsk->pinned_umcg_server_page); > + > + tsk->pinned_umcg_worker_page = NULL; > + tsk->pinned_umcg_server_page = NULL; > +} > + > +static void umcg_clear_task(struct task_struct *tsk) > +{ > + /* > + * This is either called for the current task, or for a newly forked > + * task that is not yet running, so we don't need strict atomicity > + * below. > + */ > + if (tsk->umcg_task) { > + WRITE_ONCE(tsk->umcg_task, NULL); > + > + /* These can be simple writes - see the commment above. */ > + tsk->pinned_umcg_worker_page = NULL; > + tsk->pinned_umcg_server_page = NULL; > + tsk->flags &= ~PF_UMCG_WORKER; > + } > +} > + > +/* Called for a forked or execve-ed child. */ > +void umcg_clear_child(struct task_struct *tsk) > +{ > + umcg_clear_task(tsk); > +} > + > +/* Called both by normally (unregister) and abnormally exiting workers. */ > +void umcg_handle_exiting_worker(void) > +{ > + umcg_unpin_pages(); > + umcg_clear_task(current); > +} > + > +/** > + * sys_umcg_ctl: (un)register the current task as a UMCG task. > + * @flags: ORed values from enum umcg_ctl_flag; see below; > + * @self: a pointer to struct umcg_task that describes this > + * task and governs the behavior of sys_umcg_wait if > + * registering; must be NULL if unregistering. > + * > + * @flags & UMCG_CTL_REGISTER: register a UMCG task: > + * UMCG workers: > + * - self->state must be UMCG_TASK_IDLE The code below checks UMCG_TASK_BLOCK, but the comment says UMCG_TASK_IDLE. Doc says: '+When registering a worker, self->state must be BLOCKED;' So, the comment here needs to be modified. > + * - @flags & UMCG_CTL_WORKER > + * UMCG servers: > + * - self->state must be UMCG_TASK_RUNNING > + * - !(@flags & UMCG_CTL_WORKER) > + * > + * All tasks: > + * - self->next_tid must be zero > + * > + * If the conditions above are met, sys_umcg_ctl() immediately returns > + * if the registered task is a server; a worker will be added to > + * idle_workers_ptr, and the worker put to sleep; an idle server > + * from idle_server_tid_ptr will be woken, if present. > + * > + * @flags == UMCG_CTL_UNREGISTER: unregister a UMCG task. If the current task > + * is a UMCG worker, the userspace is responsible for waking its > + * server (before or after calling sys_umcg_ctl). > + * > + * Return: > + * 0 - success > + * -EFAULT - failed to read @self > + * -EINVAL - some other error occurred > + */ > +SYSCALL_DEFINE2(umcg_ctl, u32, flags, struct umcg_task __user *, self) > +{ > + struct umcg_task ut; > + > + if (flags == UMCG_CTL_UNREGISTER) { > + if (self || !current->umcg_task) > + return -EINVAL; > + > + if (current->flags & PF_UMCG_WORKER) > + umcg_handle_exiting_worker(); > + else > + umcg_clear_task(current); > + > + return 0; > + } > + > + /* Register the current task as a UMCG task. */ > + if (!(flags & UMCG_CTL_REGISTER)) > + return -EINVAL; > + > + flags &= ~UMCG_CTL_REGISTER; > + if (flags && flags != UMCG_CTL_WORKER) > + return -EINVAL; > + > + if (current->umcg_task || !self) > + return -EINVAL; > + > + if (copy_from_user(&ut, self, sizeof(ut))) > + return -EFAULT; > + > + if (ut.next_tid) > + return -EINVAL; > + > + if (flags == UMCG_CTL_WORKER) { > + if (ut.state != UMCG_TASK_BLOCKED) > + return -EINVAL; Here is where the code check UMCG_TASK_BLOCKED. > + > + WRITE_ONCE(current->umcg_task, self); > + current->flags |= PF_UMCG_WORKER; > + > + set_tsk_need_resched(current); > + return 0; > + } > + > + /* This is a server task. */ > + if (ut.state != UMCG_TASK_RUNNING) > + return -EINVAL; > + > + WRITE_ONCE(current->umcg_task, self); > + return 0; > +} > + > +/** > + * handle_timedout_worker - make sure the worker is added to idle_workers > + * upon a "clean" timeout. > + */ > +static int handle_timedout_worker(struct umcg_task __user *self) > +{ > + u32 prev_state, next_state; > + int ret; > + > + if (get_user(prev_state, &self->state)) > + return -EFAULT; > + > + if ((prev_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_IDLE) { > + /* TODO: should we care here about TF_LOCKED or TF_PREEMPTED? */ > + > + next_state = prev_state & ~UMCG_TASK_STATE_MASK; > + next_state |= UMCG_TASK_BLOCKED; > + > + ret = cmpxchg_user_32(&self->state, &prev_state, next_state); > + if (ret) > + return ret; > + > + return -ETIMEDOUT; > + } > + > + return 0; /* Not really timed out. */ > +} > + > +/** > + * umcg_idle_loop - sleep until the current task becomes RUNNING or a timeout > + * @abs_timeout - absolute timeout in nanoseconds; zero => no timeout > + * > + * The function marks the current task as INTERRUPTIBLE and calls > + * schedule(). It returns when either the timeout expires or > + * the UMCG state of the task becomes RUNNING. > + * > + * Note: because UMCG workers should not be running WITHOUT attached servers, > + * and because servers should not be running WITH attached workers, > + * the function returns only on fatal signal pending and ignores/flushes > + * all other signals. > + */ > +static int umcg_idle_loop(u64 abs_timeout) > +{ > + int ret; > + struct hrtimer_sleeper timeout; > + struct umcg_task __user *self = current->umcg_task; > + > + if (abs_timeout) { > + hrtimer_init_sleeper_on_stack(&timeout, CLOCK_REALTIME, > + HRTIMER_MODE_ABS); > + > + hrtimer_set_expires_range_ns(&timeout.timer, (s64)abs_timeout, > + current->timer_slack_ns); > + } > + > + while (true) { > + u32 umcg_state; > + > + set_current_state(TASK_INTERRUPTIBLE); > + > + smp_mb(); /* Order with set_current_state() above. */ set_current_state() implies the smp_mb(). I don't think about it just feel it should be put above the set_current_state() to order with __set_current_state() below. > + ret = -EFAULT; > + if (get_user(umcg_state, &self->state)) { > + set_current_state(TASK_RUNNING); > + goto out; > + } > + > + ret = 0; > + if ((umcg_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_RUNNING) { > + set_current_state(TASK_RUNNING); > + goto out; > + } > + > + if (abs_timeout) > + hrtimer_sleeper_start_expires(&timeout, HRTIMER_MODE_ABS); > + > + if (!abs_timeout || timeout.task) { > + /* > + * Clear PF_UMCG_WORKER to elide workqueue handlers. > + */ > + const bool worker = current->flags & PF_UMCG_WORKER; > + > + if (worker) > + current->flags &= ~PF_UMCG_WORKER; > + > + /* > + * Note: freezable_schedule() here is not appropriate > + * as umcg_idle_loop can be called from rwsem locking > + * context (via workqueue handlers), which may > + * trigger a lockdep warning for mmap_lock. > + */ > + schedule(); > + > + if (worker) > + current->flags |= PF_UMCG_WORKER; > + } > + __set_current_state(TASK_RUNNING); > + > + /* > + * Check for timeout before checking the state, as workers > + * are not going to return from schedule() unless > + * they are RUNNING. > + */ > + ret = -ETIMEDOUT; > + if (abs_timeout && !timeout.task) > + goto out; > + > + ret = -EFAULT; > + if (get_user(umcg_state, &self->state)) > + goto out; > + > + ret = 0; > + if ((umcg_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_RUNNING) > + goto out; > + > + ret = -EINTR; > + if (fatal_signal_pending(current)) > + goto out; > + > + if (signal_pending(current)) > + flush_signals(current); > + } > + > +out: > + if (abs_timeout) { > + hrtimer_cancel(&timeout.timer); > + destroy_hrtimer_on_stack(&timeout.timer); > + } > + > + /* Workers must go through workqueue handlers upon wakeup. */ > + if (current->flags & PF_UMCG_WORKER) { > + if (ret == -ETIMEDOUT) > + ret = handle_timedout_worker(self); > + > + set_tsk_need_resched(current); > + } > + > + return ret; > +} > + > +/* > + * Try to wake up. May be called with preempt_disable set. May be called > + * cross-process. > + * > + * Note: umcg_ttwu succeeds even if ttwu fails: see wait/wake state > + * ordering logic. > + */ > +static int umcg_ttwu(u32 next_tid, int wake_flags) > +{ > + struct task_struct *next; > + > + rcu_read_lock(); > + next = find_task_by_vpid(next_tid); > + if (!next || !(READ_ONCE(next->umcg_task))) { > + rcu_read_unlock(); > + return -ESRCH; > + } > + > + /* Note: next does not necessarily share mm with current. */ > + > + try_to_wake_up(next, TASK_NORMAL, wake_flags); /* Result ignored. */ > + rcu_read_unlock(); > + > + return 0; > +} > + > +/* > + * At the moment, umcg_do_context_switch simply wakes up @next with > + * WF_CURRENT_CPU and puts the current task to sleep. May be called cross-mm. > + * > + * In the future an optimization will be added to adjust runtime accounting > + * so that from the kernel scheduling perspective the two tasks are > + * essentially treated as one. In addition, the context switch may be performed > + * right here on the fast path, instead of going through the wake/wait pair. > + */ > +static int umcg_do_context_switch(u32 next_tid, u64 abs_timeout) > +{ > + struct task_struct *next; > + > + rcu_read_lock(); > + next = find_task_by_vpid(next_tid); > + if (!next) { > + rcu_read_unlock(); > + return -ESRCH; > + } > + > + /* Note: next does not necessarily share mm with current. */ > + > + /* TODO: instead of wake + sleep, do a context switch. */ > + try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU); /* Result ignored. */ > + rcu_read_unlock(); > + > + return umcg_idle_loop(abs_timeout); > +} > + > +/** > + * sys_umcg_wait: put the current task to sleep and/or wake another task. > + * @flags: zero or a value from enum umcg_wait_flag. > + * @abs_timeout: when to wake the task, in nanoseconds; zero for no timeout. > + * > + * @self->state must be UMCG_TASK_IDLE (where @self is current->umcg_task) > + * if !(@flags & UMCG_WAIT_WAKE_ONLY). > + * > + * If @self->next_tid is not zero, it must point to an IDLE UMCG task. > + * The userspace must have changed its state from IDLE to RUNNING > + * before calling sys_umcg_wait() in the current task. This "next" > + * task will be woken (context-switched-to on the fast path) when the > + * current task is put to sleep. > + * > + * See Documentation/userspace-api/umcg.[txt|rst] for detals. > + * > + * Return: > + * 0 - OK; > + * -ETIMEDOUT - the timeout expired; > + * -EFAULT - failed accessing struct umcg_task __user of the current > + * task; > + * -ESRCH - the task to wake not found or not a UMCG task; > + * -EINVAL - another error happened (e.g. bad @flags, or the current > + * task is not a UMCG task, etc.) > + */ > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, abs_timeout) > +{ > + struct umcg_task __user *self = current->umcg_task; > + u32 next_tid; > + > + if (!self) > + return -EINVAL; > + > + if (get_user(next_tid, &self->next_tid)) > + return -EFAULT; > + > + if (flags & UMCG_WAIT_WAKE_ONLY) { > + if (!next_tid || abs_timeout) > + return -EINVAL; > + > + flags &= ~UMCG_WAIT_WAKE_ONLY; > + if (flags & ~UMCG_WAIT_WF_CURRENT_CPU) > + return -EINVAL; > + > + return umcg_ttwu(next_tid, flags & UMCG_WAIT_WF_CURRENT_CPU ? > + WF_CURRENT_CPU : 0); > + } > + > + /* Unlock the worker, if locked. */ > + if (current->flags & PF_UMCG_WORKER) { > + u32 umcg_state; > + > + if (get_user(umcg_state, &self->state)) > + return -EFAULT; > + > + if ((umcg_state & UMCG_TF_LOCKED) && cmpxchg_user_32( > + &self->state, &umcg_state, > + umcg_state & ~UMCG_TF_LOCKED)) > + return -EFAULT; > + } > + > + if (next_tid) > + return umcg_do_context_switch(next_tid, abs_timeout); > + > + return umcg_idle_loop(abs_timeout); > +} > + > +/* > + * NOTE: all code below is called from workqueue submit/update, or > + * syscall exit to usermode loop, so all errors result in the > + * termination of the current task (via SIGKILL). > + */ > + > +/* Returns true on success, false on _any_ error. */ > +static bool mark_server_running(u32 server_tid, bool may_sleep) > +{ > + struct umcg_task __user *ut_server = NULL; > + u32 state = UMCG_TASK_IDLE; > + struct task_struct *tsk; > + > + rcu_read_lock(); > + tsk = find_task_by_vpid(server_tid); > + if (tsk) > + ut_server = READ_ONCE(tsk->umcg_task); > + rcu_read_unlock(); > + > + if (!ut_server) > + return false; > + > + if (READ_ONCE(current->mm) != READ_ONCE(tsk->mm)) > + return false; > + > + if (may_sleep) > + return !cmpxchg_user_32(&ut_server->state, &state, UMCG_TASK_RUNNING); > + > + return !cmpxchg_user_32_nosleep(&ut_server->state, &state, UMCG_TASK_RUNNING); > +} > + > +/* > + * Called by sched_submit_work() for UMCG workers from within preempt_disable() > + * context. In the common case, the worker's state changes RUNNING => BLOCKED, > + * and its server's state changes IDLE => RUNNING, and the server is ttwu-ed. > + * > + * Under some conditions (e.g. the worker is "locked", see > + * /Documentation/userspace-api/umcg.[txt|rst] for more details), the > + * function does nothing. > + */ > +static void __umcg_wq_worker_sleeping(struct task_struct *tsk) > +{ > + struct umcg_task __user *ut_worker = tsk->umcg_task; > + u32 prev_state, next_state, server_tid; > + bool preempted = false; > + int ret; > + > + if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker")) > + return; > + > + /* Sometimes "locked" workers run without servers. */ > + if (unlikely(!tsk->pinned_umcg_server_page)) > + return; > + > + smp_mb(); /* The userspace may change the state concurrently. */ > + if (get_user_nosleep(prev_state, &ut_worker->state)) > + goto die; /* EFAULT */ > + > + if (prev_state & UMCG_TF_LOCKED) > + return; > + > + if ((prev_state & UMCG_TASK_STATE_MASK) != UMCG_TASK_RUNNING) > + return; /* the worker is in umcg_wait */ > + > +retry_once: > + next_state = prev_state & ~UMCG_TASK_STATE_MASK; > + next_state |= UMCG_TASK_BLOCKED; > + preempted = prev_state & UMCG_TF_PREEMPTED; > + > + ret = cmpxchg_user_32_nosleep(&ut_worker->state, &prev_state, next_state); > + if (ret == -EAGAIN) { > + if (preempted) > + goto die; /* Preemption can only happen once. */ > + > + if (prev_state != (UMCG_TASK_RUNNING | UMCG_TF_PREEMPTED)) > + goto die; /* Only preemption can happen. */ > + > + preempted = true; > + goto retry_once; > + } > + if (ret) > + goto die; /* EFAULT */ > + > + if (get_user_nosleep(server_tid, &ut_worker->next_tid)) > + goto die; /* EFAULT */ > + > + if (!server_tid) > + return; /* Waking a waiting worker leads here. */ > + > + /* The idle server's wait may timeout. */ > + /* TODO: make a smarter context switch below when available. */ > + if (mark_server_running(server_tid, false)) > + umcg_ttwu(server_tid, WF_CURRENT_CPU); > + > + return; > + > +die: > + pr_warn("umcg_wq_worker_sleeping: killing task %d\n", current->pid); > + force_sig(SIGKILL); > +} > + > +/* Called from sched_submit_work() with preempt_disable. */ > +void umcg_wq_worker_sleeping(struct task_struct *tsk) > +{ > + __umcg_wq_worker_sleeping(tsk); > + umcg_unpin_pages(); > +} > + > +/** > + * enqueue_idle_worker - push an idle worker onto idle_workers_ptr list/stack. This function attract idot like me enough. > + * Returns true on success, false on a fatal failure. > + * > + * See Documentation/userspace-api/umcg.[txt|rst] for details. > + */ > +static bool enqueue_idle_worker(struct umcg_task __user *ut_worker) > +{ > + u64 __user *node = &ut_worker->idle_workers_ptr; > + u64 __user *head_ptr; > + u64 first = (u64)node; > + u64 head; > + > + if (get_user(head, node) || !head) > + return false; > + > + head_ptr = (u64 __user *)head; > + > + if (put_user(UMCG_IDLE_NODE_PENDING, node)) > + return false; > + > + if (xchg_user_64(head_ptr, &first)) > + return false; > + > + if (put_user(first, node)) > + return false; > + > + return true; > +} > + > +/** > + * get_idle_server - retrieve an idle server, if present. > + * > + * Returns true on success, false on a fatal failure. > + */ > +static bool get_idle_server(struct umcg_task __user *ut_worker, u32 *server_tid) > +{ > + u64 server_tid_ptr; > + u32 tid; > + int ret; > + > + *server_tid = 0; /* Empty result is OK. */ > + > + if (get_user(server_tid_ptr, &ut_worker->idle_server_tid_ptr)) > + return false; > + > + if (!server_tid_ptr) > + return false; > + > + tid = 0; > + ret = xchg_user_32((u32 __user *)server_tid_ptr, &tid); > + > + if (ret) > + return false; > + > + if (tid && mark_server_running(tid, true)) > + *server_tid = tid; > + > + return true; > +} > + > +/* > + * Returns true to wait for the userspace to schedule this worker, false > + * to return to the userspace. In the common case, enqueues the worker > + * to idle_workers_ptr list and wakes the idle server (if present). > + */ > +static bool process_waking_worker(struct task_struct *tsk, u32 *server_tid) > +{ > + struct umcg_task __user *ut_worker = tsk->umcg_task; > + u32 prev_state, next_state; > + int ret = 0; > + > + *server_tid = 0; > + > + if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker")) > + return false; > + > + if (fatal_signal_pending(tsk)) > + return false; > + > + smp_mb(); /* The userspace may concurrently modify the worker's state. */ > + if (get_user(prev_state, &ut_worker->state)) > + goto die; > + > + if ((prev_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_RUNNING) { > + u32 tid; > + > + if (prev_state & UMCG_TF_LOCKED) > + return true; /* Wakeup: wait but don't enqueue. */ > + > + smp_mb(); /* Order getting state and getting server_tid */ > + > + if (get_user(tid, &ut_worker->next_tid)) > + goto die; > + > + *server_tid = tid; > + > + if (prev_state & UMCG_TF_PREEMPTED) { > + if (!tid) > + goto die; /* PREEMPTED workers must have a server. */ > + > + /* Always enqueue preempted workers. */ > + if (!mark_server_running(tid, true)) > + goto die; > + } else if (tid) > + return false; /* pass-through: RUNNING with a server. */ > + > + /* If !PREEMPTED, the worker gets here via UMCG_WAIT_WAKE_ONLY */ > + } else if (unlikely((prev_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_IDLE && > + (prev_state & UMCG_TF_LOCKED))) > + return false; /* The worker prepares to sleep or to unregister. */ > + > + if ((prev_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_IDLE) > + return true; /* the worker called umcg_wait(); don't enqueue */ > + > + next_state = prev_state & ~UMCG_TASK_STATE_MASK; > + next_state |= UMCG_TASK_IDLE; > + > + if (prev_state != next_state) > + ret = cmpxchg_user_32(&ut_worker->state, &prev_state, next_state); > + if (ret) > + goto die; > + > + if (!enqueue_idle_worker(ut_worker)) > + goto die; > + > + smp_mb(); /* Order enqueuing the worker with getting the server. */ > + if (!(*server_tid) && !get_idle_server(ut_worker, server_tid)) > + goto die; > + > + return true; > + > +die: > + pr_warn("umcg_process_waking_worker: killing task %d\n", current->pid); > + force_sig(SIGKILL); > + return false; > +} > + > +/* > + * Called from sched_update_worker(): defer all work until later, as > + * sched_update_worker() may be called with in-kernel locks held. > + */ > +void umcg_wq_worker_running(struct task_struct *tsk) > +{ > + set_tsk_thread_flag(tsk, TIF_NOTIFY_RESUME); > +} > + > +/* Called via TIF_NOTIFY_RESUME flag from exit_to_user_mode_loop. */ > +void umcg_handle_resuming_worker(void) > +{ > + u32 server_tid; > + > + /* Avoid recursion by removing PF_UMCG_WORKER */ > + current->flags &= ~PF_UMCG_WORKER; > + > + do { > + bool should_wait; > + > + should_wait = process_waking_worker(current, &server_tid); > + > + if (!should_wait) > + break; > + > + if (server_tid) > + umcg_do_context_switch(server_tid, 0); > + else > + umcg_idle_loop(0); > + } while (true); > + > + if (server_tid && umcg_pin_pages(server_tid)) > + goto die; > + > + if (!server_tid) /* No server => no reason to pin pages. */ > + umcg_unpin_pages(); > + > + goto out; > + > +die: > + pr_warn("%s: killing task %d\n", __func__, current->pid); > + force_sig(SIGKILL); > +out: > + current->flags |= PF_UMCG_WORKER; > +} > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > index 0ea8128468c3..cd1be6356e42 100644 > --- a/kernel/sys_ni.c > +++ b/kernel/sys_ni.c > @@ -272,6 +272,10 @@ COND_SYSCALL(landlock_create_ruleset); > COND_SYSCALL(landlock_add_rule); > COND_SYSCALL(landlock_restrict_self); > > +/* kernel/sched/umcg.c */ > +COND_SYSCALL(umcg_ctl); > +COND_SYSCALL(umcg_wait); > + > /* arch/example/kernel/sys_example.c */ > > /* mm/fadvise.c */ > -- > 2.25.1 > Thanks, Tao