Define struct umcg_task and sys_umcg_ctl/sys_umcg_wait syscalls. This is another attempt at implementing UMCG, based on discussion in https://lore.kernel.org/patchwork/cover/1433967/ Most of the "why" is covered here (some details are obsolete): https://lore.kernel.org/patchwork/cover/1433967/#1632328 I'll update this commit message with more "why" when the general approach is ACKed at a high level. In this patch I used the approach suggested by peterz@ (should I add a Suggested-by: tag?) in the discussion linked above; specifically, only a single struct umcg_task __user *umcg_task pointer is added to struct task_struct. Comments in include/uapi/linux/umcg.h and kernel/sched/umcg.c provide many details on how UMCG syscalls are to be used. What is NOT implemented yet: - timeouts; - preemption. All the basics (wait/wake/swap, block/wake detection) seem to be working. v0.2->v0.3 changes: - new protocol for working with idle workers and servers is used, to avoid spinning in the kernel; - waking a UMCG task now does not require spinning. Signed-off-by: Peter Oskolkov <posk@xxxxxxxxxx> --- arch/x86/entry/syscalls/syscall_64.tbl | 2 + include/linux/sched.h | 6 + include/linux/syscalls.h | 4 + include/uapi/asm-generic/unistd.h | 8 +- include/uapi/linux/umcg.h | 259 +++++++++++++ init/Kconfig | 10 + kernel/exit.c | 7 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 17 +- kernel/sched/umcg.c | 485 +++++++++++++++++++++++++ kernel/sched/umcg.h | 13 + kernel/sys_ni.c | 4 + 12 files changed, 813 insertions(+), 3 deletions(-) create mode 100644 include/uapi/linux/umcg.h create mode 100644 kernel/sched/umcg.c diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ce18119ea0d0..0c6c7fd72b0b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -368,6 +368,8 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +447 common umcg_ctl sys_umcg_ctl +448 common umcg_wait sys_umcg_wait # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/sched.h b/include/linux/sched.h index 50db9496c99d..185ad1cdde77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -66,6 +66,7 @@ struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct umcg_task; /* * Task state bitmask. NOTE! These bits are also @@ -1223,6 +1224,10 @@ struct task_struct { unsigned long rseq_event_mask; #endif +#ifdef CONFIG_UMCG + struct umcg_task __user *umcg_task; +#endif + struct tlbflush_unmap_batch tlb_ubc; union { @@ -1599,6 +1604,7 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ +#define PF_UMCG_WORKER 0x01000000 /* UMCG worker */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 050511e8f1f8..f3e1ef8d842f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -71,6 +71,7 @@ struct open_how; struct mount_attr; struct landlock_ruleset_attr; enum landlock_rule_type; +struct umcg_task; #include <linux/types.h> #include <linux/aio_abi.h> @@ -1050,6 +1051,9 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type, const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); +asmlinkage long sys_umcg_ctl(u32 flags, struct umcg_task __user *self); +asmlinkage long sys_umcg_wait(u32 flags, u64 abs_timeout); + /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 6de5a7fc066b..1a4c9ac0e296 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -873,8 +873,14 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#define __NR_umcg_ctl 447 +__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl) +#define __NR_umcg_wait 448 +__SYSCALL(__NR_umcg_wait, sys_umcg_wait) + + #undef __NR_syscalls -#define __NR_syscalls 447 +#define __NR_syscalls 449 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/umcg.h b/include/uapi/linux/umcg.h new file mode 100644 index 000000000000..402974b475bf --- /dev/null +++ b/include/uapi/linux/umcg.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_UMCG_H +#define _UAPI_LINUX_UMCG_H + +#include <linux/limits.h> +#include <linux/types.h> + +/* + * UMCG: User Managed Concurrency Groups. + * + * Syscalls (see kernel/sched/umcg.c): + * sys_umcg_ctl() - register/unregister UMCG tasks; + * sys_umcg_wait() - wait/wake/context-switch. + * + * struct umcg_task (below): controls the state of UMCG tasks. + */ + +/* + * UMCG task states, the first 8 bits. The states represent the user space + * point of view. + * + * UMCG tasks can be either RUNNING, i.e. doing useful work, or IDLE, + * i.e. have no work assigned to them and blocked in sys_umcg_wait(). + * + * In addition, when a UMCG worker blocks in the kernel (e.g. on I/O), + * it is marked as BLOCKED; when a BLOCKED worker completes its blocking + * operation, it is marked as IDLE and added to idle_workers list (see + * struct umcg_task below), until a server "runs" it. + * + * UMCG servers continue to be considered as RUNNING even if they are blocked + * in the kernel in any way other than in sys_umcg_wait(). + * + * State transitions: + * + * RUNNING => IDLE: the current RUNNING task becomes IDLE by calling + * sys_umcg_wait(); + * IDLE => RUNNING: - another worker or server task called + * sys_umcg_wait() with self->next_tid pointing to the + * task transitioning from IDLE to RUNNING (mostly + * applies to workers and basic tasks); + * - the userspace marked and IDLE task as RUNNING and + * sent a signal to it (thus interrupting sys_umcg_wait); + * - servers: the kernel wakes an IDLE server from + * idle_servers list when a BLOCKED worker becomes IDLE + * (see below); + * - servers: the kernel wakes and IDLE server that + * is "attached" to a RUNNING worker when the worker + * becomes BLOCKED; + * RUNNING => BLOCKED: when a RUNNING UMCG worker blocks in the kernel, + * the kernel marks it as BLOCKED (and wakes its server); + * BLOCKED => IDLE: when a BLOCKED UMCG worker finishes its blocking + * operation, the kernel marks it as IDLE, adds it to + * the list of idle workers (see struct umcg_task) and + * wakes an idle server from the list of idle servers, if + * available. + * + * Note 1: only the transitions listed above are possible; these state + * transitions never happen: + * - IDLE => BLOCKED (never happen) + * - BLOCKED => RUNNING (never happen) + * + * Note 2: only UMCG workers (UMCG tasks registered with UMCG_CTL_WORKER + * flag set) are subject to block/wake detection logic; + * + * Note 3: if a worker has UMC_TF_LOCKED state flag set, it behaves as + * a server, i.e. the block/wake detection is disabled (this is a UMCG + * equivalent of task_lock() or preempt_disable()). UMCG_TF_LOCKED + * flag is cleared by the kernel when the worker goes to sleep in + * umcg_wait(). + * + * Note 4: changing the value of umcg_task.state field is the responsibility + * of the party initiating the state change: when a state transition + * is initiated by the userspace via a call to sys_umcg_wait(), it + * is the userspace's responsibility to change the value of the + * umcg_task.state field; when a state transition is initiated + * by the kernel during worker block/wake handling, it is the kernel + * who marks the worker as BLOCKED or IDLE, and the server as RUNNING. + */ +#define UMCG_TASK_NONE 0 +#define UMCG_TASK_RUNNING 1 +#define UMCG_TASK_IDLE 2 +#define UMCG_TASK_BLOCKED 3 + +#define UMCG_TF_STATE_MASK 0xff + +/* UMCG task state flags, bits 8-15 */ + +/* + * UMCG_TF_LOCKED: locked by the userspace; workers with UMCG_TF_LOCKED set + * do not become BLOCKED and do not wake their attached server. + */ +#define UMCG_TF_LOCKED (1 << 8) + +/** + * struct umcg_task - controls the state of UMCG tasks. + * + * UMCG tasks can be: + * + * - UMCG workers: must have a UMCG server assigned when running (unless + * UMCG_TF_LOCKED flag is set); the server is woken when + * the worker blocks; has PF_UMCG_WORKER task flag set + * in task_struct. + * + * Both @idle_servers_ptr and @idle_workes_ptr are !NULL + * when running or calling sys_umcg_wait(). + * + * A worker's state can be: + * - RUNNING: is schedulable by the kernel, has a server + * assigned in @server_tid; + * - IDLE: not schedulable by the kernel; can be + * context-switched into via sys_umcg_wait; + * - BLOCKED: blocked in the kernel (e.g. on I/O). + * + * - UMCG servers: A server's state can be: + * - RUNNING: behaves like a "normal" task: is schedulable + * by the kernel, can block on I/O, etc. + * - IDLE: not schedulable by the kernel. + * + * See sys_umcg_ctl() documentation in kernel/sched/umcg.c for a detailed + * explanation of how UMCG task types are determined. + * + * See sys_umcg_wait() documentation in kernel/sched/ucmg.c for a detailed + * explanation of server/worker interactions. + * + * Once a UMCG task is registered, it may not change its type. + * + * The struct is aligned at 64 bytes to ensure that it fits into + * a single cache line. + */ +struct umcg_task { + /** + * @state: the current state of the UMCG task described by this struct. + * + * Readable/writable by both the kernel and the userspace. + * + * UMCG task state: + * bits 0 - 7: task state; + * bits 8 - 15: state flags; + * bits 16 - 23: reserved; must be zeroes; + * bits 24 - 31: for userspace use. + */ + uint32_t state; /* r/w */ + + /** + * @api_version: the version of UMCG API the userspace would like + * to use. Must be set before calling umcg_ctl + * and must not be changed afterwards. + */ + uint32_t api_version; /* r */ + + /** + * @server_tid: the TID of the server UMCG task that should be + * woken when this WORKER becomes BLOCKED. Can be zero. + * + * If this is a UMCG server, @server_tid should + * contain the TID of @self - it will be used to find + * the task_struct to wake when pulled from + * @idle_servers. + * + * Read-only for the kernel, read/write for the userspace. + */ + uint32_t server_tid; /* r */ + + /** + * @next_tid: the TID of the UMCG task that should be context-switched + * into in sys_umcg_wait(). Can be zero. + * + * Read-only for the kernel, read/write for the userspace. + */ + uint32_t next_tid; /* r */ + + /** + * @idle_servers_ptr: a single-linked list pointing to the list + * of idle servers. Can be NULL. + * + * Readable/writable by both the kernel and the userspace: the + * userspace adds items to the list, the kernel removes them. + * + * This is a single-linked list (stack): head->next->next->next->NULL. + * "next" nodes are idle_servers_ptr fields in struct umcg_task. + * + * Example: + * + * a running worker idle server 1 idle server 2 + * + * struct umct_task: struct umcg_task: struct umcg_task: + * state state state + * api_version api_version api_version + * ... ... ... + * idle_servers_ptr --> head --> idle_servers_ptr --> idle_servers_ptr --> NULL + * ... ... ... + * + * + * Due to the way struct umcg_task is aligned, idle_servers_ptr + * is aligned at 8 byte boundary, and so has its first byte as zero + * when it holds a valid pointer. + * + * When pulling idle servers from the list, the kernel marks nodes as + * "deleted" by ORing the node value (the pointer) with 1UL atomically. + * If a node is "deleted" (i.e. its value AND 1UL is not zero), + * the kernel proceeds to the next node. + * + * The kernel checks at most [nr_cpu_ids * 2] first nodes in the list. + * + * It is NOT considered an error if the kernel cannot find an idle + * server. + * + * The userspace is responsible for cleanup/gc (i.e. for actually + * removing nodes marked as "deleted" from the list). + */ + uint64_t idle_servers_ptr; /* r/w */ + + /** + * @idle_workers_ptr: a single-linked list pointing to the list + * of idle workers. Can be NULL. + * + * Readable/writable by both the kernel and the userspace: the + * kernel adds items to the list, the userspace removes them. + * + * The list (stack) is structured the same way as idle_servers_ptr + * above. The kernel pushes new nodes to the beginning of the list + * by assigning the current head value to the node's idle_workers_ptr + * and trying to atomically change the head to point to the new node. + * + * The kernel tries at most [nr_cpu_ids + 1] times to push a node + * onto the stack, after which the idle worker will sleep a short + * while before attempting to do so again. After several failed + * attempts the kernel will SIGKILL the worker. + */ + uint64_t idle_workers_ptr; /* r/w */ +} __attribute__((packed, aligned(8 * sizeof(__u64)))); + +/** + * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl + * @UMCG_CTL_REGISTER: register the current task as a UMCG task + * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task + * @UMCG_CTL_WORKER: register the current task as a UMCG worker + * + * See sys_umcg_ctl documentation for more details. + */ +enum umcg_ctl_flag { + UMCG_CTL_REGISTER = 0x00001, + UMCG_CTL_UNREGISTER = 0x00002, + UMCG_CTL_WORKER = 0x10000, +}; + +/** + * enum umcg_wait_flag - flags to pass to sys_umcg_wait + * @UMCG_WAIT_WAKE_ONLY: wake @self->next_tid, don't put @self to sleep; + * @UMCG_WAIT_WF_CURRENT_CPU: wake @self->next_tid on the current CPU + * (use WF_CURRENT_CPU); @UMCG_WAIT_WAKE_ONLY + * must be set. + */ +enum umcg_wait_flag { + UMCG_WAIT_WAKE_ONLY = 1, + UMCG_WAIT_WF_CURRENT_CPU = 2, +}; + +#endif /* _UAPI_LINUX_UMCG_H */ diff --git a/init/Kconfig b/init/Kconfig index a61c92066c2e..c15a50a61ba6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1662,6 +1662,16 @@ config MEMBARRIER If unsure, say Y. +config UMCG + bool "Enable User Managed Concurrency Groups API" + depends on X86_64 + default n + help + Enable User Managed Concurrency Groups API, which form the basis + for an in-process M:N userspace scheduling framework. + At the moment this is an experimental/RFC feature that is not + guaranteed to be backward-compatible. + config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y diff --git a/kernel/exit.c b/kernel/exit.c index fd1c04193e18..dc8398558d87 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -744,6 +744,13 @@ void __noreturn do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); +#ifdef CONFIG_UMCG + if (unlikely(tsk->flags & PF_UMCG_WORKER)) + tsk->flags &= ~PF_UMCG_WORKER; + + tsk->umcg_task = NULL; +#endif + /* * If do_exit is called because this processes oopsed, it's possible * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 978fcfca5871..e4e481eee1b7 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o obj-$(CONFIG_SCHED_CORE) += core_sched.o +obj-$(CONFIG_UMCG) += umcg.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 293f5801bf81..f7ddeed72e30 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ #include "pelt.h" #include "smp.h" +#include "umcg.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -3961,6 +3962,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; #endif +#ifdef CONFIG_UMCG + p->umcg_task = NULL; + p->flags &= ~PF_UMCG_WORKER; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -5975,10 +5980,14 @@ static inline void sched_submit_work(struct task_struct *tsk) * in the possible wakeup of a kworker and because wq_worker_sleeping() * requires it. */ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) { preempt_disable(); if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); +#ifdef CONFIG_UMCG + else if (task_flags & PF_UMCG_WORKER) + umcg_wq_worker_sleeping(tsk); +#endif else io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); @@ -5997,9 +6006,13 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) { if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); +#ifdef CONFIG_UMCG + else if (tsk->flags & PF_UMCG_WORKER) + umcg_wq_worker_running(tsk); +#endif else io_wq_worker_running(tsk); } diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c new file mode 100644 index 000000000000..f87c32974882 --- /dev/null +++ b/kernel/sched/umcg.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * User Managed Concurrency Groups (UMCG). + * + * See include/uapi/linux/umcg.h for more documentation. + */ + +#include <linux/syscalls.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/umcg.h> +#include <linux/freezer.h> + +#include "sched.h" +#include "umcg.h" + +static int umcg_validate_version(u32 api_version) +{ + if (api_version == 1) + return 0; + return 1; +} + +/** + * sys_umcg_ctl: (un)register the current task as a UMCG task. + * @flags: ORed values from enum umcg_ctl_flag; see below; + * @self: a pointer to struct umcg_task that describes this + * task and governs the behavior of sys_umcg_wait if + * registering; must be NULL if unregistering. + * + * @flags & UMCG_CTL_REGISTER: register a UMCG task: + * UMCG workers: + * - self->state must be UMCG_TASK_IDLE + * - @flags & UMCG_CTL_WORKER + * UMCG servers: + * - self->state must be UMCG_TASK_RUNNING + * - !(@flags & UMCG_CTL_WORKER) + * + * All tasks: + * - self->api_version must match one of the supported API + * versions + * - self->server_tid must be zero + * - self->next_tid must be zero + * + * If the conditions above are met, sys_umcg_ctl() immediately returns + * if the registered task is a server; a worker will be added to + * idle_workers_ptr, and the worker put to sleep; an idle server + * from idle_servers_ptr will be woken, if any. + * + * @flags == UMCG_CTL_UNREGISTER: unregister a UMCG task. If the current task + * is a UMCG worker, the userspace is responsible for waking its + * server (before or after calling sys_umcg_ctl). + * + * Return: + * 0 - success + * > 0 - the highest supported API version if @self->api_version + * is not supported (when registering) + * -EFAULT - failed to read @self + * -EINVAL - some other error occurred + */ +SYSCALL_DEFINE2(umcg_ctl, u32, flags, struct umcg_task __user *, self) +{ + struct umcg_task ut; + int ret; + + if (flags == UMCG_CTL_UNREGISTER) { + if (self || !current->umcg_task) + return -EINVAL; + + if (current->flags & PF_UMCG_WORKER) + current->flags &= ~PF_UMCG_WORKER; + + current->umcg_task = NULL; + return 0; + } + + /* Register the current task as a UMCG task. */ + if (!(flags & UMCG_CTL_REGISTER)) + return -EINVAL; + + flags &= ~UMCG_CTL_REGISTER; + if (flags && flags != UMCG_CTL_WORKER) + return -EINVAL; + + if (current->umcg_task) + return -EINVAL; + + if (copy_from_user(&ut, self, sizeof(ut))) + return -EFAULT; + + ret = umcg_validate_version(ut.api_version); + if (ret) + return ret; + + if (ut.server_tid || ut.next_tid) + return -EINVAL; + + if (flags == UMCG_CTL_WORKER) { + if (ut.state != UMCG_TASK_IDLE) + return -EINVAL; + + current->umcg_task = self; + current->flags |= PF_UMCG_WORKER; + + umcg_wq_worker_running(current); /* Will sleep. */ + return 0; + } + + /* This is a server task. */ + if (ut.state != UMCG_TASK_RUNNING) + return -EINVAL; + + current->umcg_task = self; + return 0; +} + +/* Sleep until interrupted or self.state becomes RUNNING or timeout expires. */ +static int umcg_idle_loop(u64 abs_timeout) +{ + struct umcg_task __user *self = current->umcg_task; + + if (abs_timeout) + return -EOPNOTSUPP; + + /* Unlock the worker, if locked. */ + if (current->flags & PF_UMCG_WORKER) { + u32 umcg_state; + + smp_mb(); /* Protect the read below. */ + if (get_user(umcg_state, &self->state)) + return -EFAULT; + + if ((umcg_state & UMCG_TF_LOCKED) && cmpxchg_user_32( + &self->state, &umcg_state, + umcg_state & ~UMCG_TF_LOCKED)) + return -EFAULT; + } + + while (true) { + u32 umcg_state; + + set_current_state(TASK_INTERRUPTIBLE); + + smp_mb(); /* Order with set_current_state() above. */ + if (get_user(umcg_state, &self->state)) { + set_current_state(TASK_RUNNING); + return -EFAULT; + } + + if ((umcg_state & UMCG_TF_STATE_MASK) == + UMCG_TASK_RUNNING) { + set_current_state(TASK_RUNNING); + return 0; + } + + freezable_schedule(); + + if (signal_pending(current)) + return -EINTR; + + if (get_user(umcg_state, &self->state)) + return -EFAULT; + + if ((umcg_state & UMCG_TF_STATE_MASK) == UMCG_TASK_RUNNING) + return 0; + } +} + +/* + * Try to wake up. May be called with preempt_disable set. + * + * Note: umcg_ttwu succeeds even if ttwu fails: see wait/wake state + * ordering logic documented in sys_umcg_wait() below. + */ +static int umcg_ttwu(u32 next_tid, int wake_flags) +{ + struct task_struct *next; + + rcu_read_lock(); + next = find_task_by_vpid(next_tid); + if (!next || !(READ_ONCE(next->umcg_task))) { + rcu_read_unlock(); + return -ESRCH; + } + + try_to_wake_up(next, TASK_NORMAL, wake_flags); /* Result ignored. */ + rcu_read_unlock(); + + return 0; +} + +/* + * At the moment, umcg_do_context_switch simply wakes up @next with + * WF_CURRENT_CPU and puts the current task to sleep. + * + * In the future an optimization will be added to adjust runtime accounting + * so that from the kernel scheduling perspective the two tasks are + * essentially treated as one. + */ +static int umcg_do_context_switch(u32 next_tid, u64 abs_timeout) +{ + struct task_struct *next; + + if (abs_timeout) + return -EOPNOTSUPP; + + rcu_read_lock(); + next = find_task_by_vpid(next_tid); + if (!next) { + rcu_read_unlock(); + return -ESRCH; + } + + /* TODO: instead of wake + sleep, do a context switch. */ + try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU); /* Result ignored. */ + rcu_read_unlock(); + + return umcg_idle_loop(abs_timeout); +} + +/** + * sys_umcg_wait: sleep the current task and/or wake another task. + * @flags: zero or a value from enum umcg_wait_flag. + * @abs_timeout: when to wake the task; zero for no timeout. NOT SUPPORTED yet. + * + * @self->state must be UMCG_TASK_IDLE (where @self is current->umcg_task + * if !(@flags & UMCG_WAIT_WAKE_ONLY). + * + * If @self->next_tid is not zero, it must point to an IDLE UMCG task blocked + * in sys_umcg_wait(). The userspace must have changed its state from IDLE to + * RUNNING before calling sys_umcg_wait() in the current task. This "next" + * task will be woken (context-switched-to on the fast path) when the current + * task is put to sleep. + * + * If this is a worker (PF_UMCG_WORKER is set), and @self->next_tid is zero, + * the server assigned to this worker (@self->server_tid) will be + * woken/switched-to; same rules apply (the server must be waiting in + * sys_umcg_wait(); its state must be RUNNING now). + * + * If @self->next_tid points to a UMCG worker, it must have its server_tid + * set, with the server blocked in sys_umcg_wait(). + * + * + * Note: wait/wake ordering: to avoid missing wakeups, the following + * state changes order is required: + * + * wait: the userspace marks the current task's UMCG state as IDLE + * and calls sys_umcg_wait(). + * wake: the userspace marks the wakee's UMCG state as RUNNING and + * calls sys_umcg_wait() with the wakee's TID in self->next_tid; + * + * To wake a umcg task, the kernel issues a single ttwu() call, ignoring + * the result. On the wait path the kernel carefully orders task + * state changes with umcg state checks to ensure the wakeup above + * is not lost. See umcg_idle_loop for details. + * + * Return: + * 0 - OK; + * -ETIMEDOUT - the timeout expired; + * -EFAULT - failed accessing struct umcg_task __user of the current + * task; + * -ESRCH - the task to wake not found or not a UMCG task; + * -EINVAL - another error happened (e.g. bad @flags, or the current + * task is not a UMCG task, etc.) + */ +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, abs_timeout) +{ + struct umcg_task __user *self = current->umcg_task; + u32 next_tid; + + if (!self) + return -EINVAL; + + if (get_user(next_tid, &self->next_tid)) + return -EFAULT; + + if (flags & UMCG_WAIT_WAKE_ONLY) { + if (!next_tid || abs_timeout) + return -EINVAL; + + flags &= ~UMCG_WAIT_WAKE_ONLY; + if (flags & ~UMCG_WAIT_WF_CURRENT_CPU) + return -EINVAL; + + return umcg_ttwu(next_tid, flags & UMCG_WAIT_WF_CURRENT_CPU ? + WF_CURRENT_CPU : 0); + } + + if (!next_tid && current->flags & PF_UMCG_WORKER) { + if (get_user(next_tid, &self->server_tid)) + return -EFAULT; + } + + if (next_tid) + return umcg_do_context_switch(next_tid, abs_timeout); + + return umcg_idle_loop(abs_timeout); +} + +#define umcg_die() \ +{ \ + pr_warn("UMCG: umcg_die: %s:%d\n", __FILE__, __LINE__); \ + force_sig(SIGSEGV); \ +} + +/* Try to wake the server; may be called within preempt_disable section. */ +static bool try_to_wake_server(struct umcg_task __user *ut_server, u32 server_tid) +{ + u32 state = UMCG_TASK_IDLE; + + if (WARN_ON(!(ut_server || server_tid))) + return false; + + if (!ut_server) { + struct task_struct *tsk; + + rcu_read_lock(); + tsk = find_task_by_vpid(server_tid); + if (tsk) + ut_server = READ_ONCE(tsk->umcg_task); + rcu_read_unlock(); + + if (!ut_server) + return false; + } + + if (!server_tid && get_user(server_tid, &ut_server->server_tid)) + return false; + + if (cmpxchg_user_32(&ut_server->state, &state, UMCG_TASK_RUNNING)) { + if (state != UMCG_TASK_RUNNING) + umcg_die(); /* The userspace broke the contract. */ + return false; + } + + /* TODO: make a smarter context switch when available. */ + return umcg_ttwu(server_tid, WF_CURRENT_CPU) == 0; +} + +/* + * Change the worker's state RUNNING => BLOCKED or BLOCKED => IDLE, depending + * on context (@sleeping). + * + * May be called with preempt_disable. + * + * Returns true to continue; false to abort. + */ +static bool wq_worker_change_state(struct umcg_task __user *ut_worker, + bool sleeping) +{ + u32 prev_state, next_state; + int ret; + + smp_mb(); /* Guard the read below. */ + if (get_user_nosleep(prev_state, &ut_worker->state)) { + umcg_die(); + return false; + } + + if (prev_state & UMCG_TF_LOCKED) + return false; + + if (sleeping) { + if ((prev_state & UMCG_TF_STATE_MASK) != + UMCG_TASK_RUNNING) + return false; + } else { + if ((prev_state & UMCG_TF_STATE_MASK) == + UMCG_TASK_RUNNING) { + /* + * Workers with servers attached should + * pass through; workers without servers + * should wait. + */ + u32 server_tid; + + if (get_user_nosleep(server_tid, + &ut_worker->server_tid)) { + umcg_die(); /* The userspace broke the conract. */ + return false; + } + + if (server_tid) + return false; + } + } + + next_state = prev_state & ~UMCG_TF_STATE_MASK; + next_state |= sleeping ? UMCG_TASK_BLOCKED : UMCG_TASK_IDLE; + + ret = cmpxchg_user_32(&ut_worker->state, &prev_state, next_state); + + if (!ret) + return true; + + umcg_die(); /* The userspace broke the contract. */ + return false; +} + +/* Called from sched_submit_work() with preempt_disable. */ +void umcg_wq_worker_sleeping(struct task_struct *tsk) +{ + struct umcg_task __user *ut_worker = tsk->umcg_task; + u32 server_tid; + + if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker")) + return; + + /* Step one: mark the worker as BLOCKED. */ + if (!wq_worker_change_state(ut_worker, true)) + return; + + /* Step two: wake the server, if any. */ + if (get_user_nosleep(server_tid, &ut_worker->server_tid)) { + umcg_die(); /* EFAULT */ + return; + } + + if (!server_tid) + return; + + /* TODO: make a smarter context switch when available. */ + try_to_wake_server(NULL, server_tid); +} + +/* Called from sched_update_worker(). May sleep. */ +void umcg_wq_worker_running(struct task_struct *tsk) +{ + struct umcg_task __user *ut_worker = tsk->umcg_task; + u64 head, popped_server; + + if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker")) + return; + + /* + * Remove the workqueue flag to avoid triggering + * umcg_wq_worker_sleeping. + */ + current->flags &= ~PF_UMCG_WORKER; + + /* Step 1: mark the worker as IDLE and add to the idle workers list. */ + if (!wq_worker_change_state(ut_worker, false)) + goto out; + + if (get_user(head, &ut_worker->idle_workers_ptr)) + goto die; /* EFAULT */ + + if (!head) + goto die; /* The userspace broke the conract. */ + + if (umcg_sll_push((u64 __user *)head, + (u64 __user *)&ut_worker->idle_workers_ptr, + nr_cpu_ids * 2 /* max number of times to push */)) + goto die; /* TODO: sleep+retry several times before dying. */ + + smp_mb(); /* Make sure steps 1 and 2 are ordered. */ + + /* Step 2: wake an idle server, if any. */ + if (get_user(head, &ut_worker->idle_servers_ptr)) + goto die; + + /* The number of servers should not exceed the number of CPUs much. */ + if (head && umcg_sll_pop((u64 __user *)head, &popped_server, + nr_cpu_ids * 2 /* max number of nodes to check*/)) + goto die; + + if (popped_server) { + struct umcg_task __user *ut_server = container_of( + (u64 *)popped_server, + struct umcg_task, idle_servers_ptr); + + try_to_wake_server(ut_server, 0); + } + + /* Step 3: sleep until woken by a server */ + umcg_idle_loop(0); + +out: + current->flags |= PF_UMCG_WORKER; + return; + +die: + umcg_die(); +} diff --git a/kernel/sched/umcg.h b/kernel/sched/umcg.h index 435531d751f2..619ba02be1d4 100644 --- a/kernel/sched/umcg.h +++ b/kernel/sched/umcg.h @@ -9,6 +9,19 @@ #include <asm/asm.h> #include <linux/atomic.h> +#ifdef CONFIG_UMCG + +struct task_struct; + +/* + * umcg_wq_worker_[sleeping|running] are called in core.c by + * sched_submit_work() and sched_update_worker(). + */ +void umcg_wq_worker_sleeping(struct task_struct *tsk); +void umcg_wq_worker_running(struct task_struct *tsk); + +#endif /* CONFIG_UMCG */ + /* TODO: move atomic operations below into arch/ headers */ static inline int __try_cmpxchg_user_32(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0ea8128468c3..cd1be6356e42 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -272,6 +272,10 @@ COND_SYSCALL(landlock_create_ruleset); COND_SYSCALL(landlock_add_rule); COND_SYSCALL(landlock_restrict_self); +/* kernel/sched/umcg.c */ +COND_SYSCALL(umcg_ctl); +COND_SYSCALL(umcg_wait); + /* arch/example/kernel/sys_example.c */ /* mm/fadvise.c */ -- 2.25.1