[RFC PATCH 4/4 v0.3] sched/umcg: RFC: implement UMCG syscalls

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Define struct umcg_task and sys_umcg_ctl/sys_umcg_wait syscalls.

This is another attempt at implementing UMCG, based on
discussion in https://lore.kernel.org/patchwork/cover/1433967/

Most of the "why" is covered here (some details are obsolete):
https://lore.kernel.org/patchwork/cover/1433967/#1632328

I'll update this commit message with more "why" when the general
approach is ACKed at a high level.

In this patch I used the approach suggested by peterz@ (should I add
a Suggested-by: tag?) in the discussion linked above;
specifically, only a single

struct umcg_task __user *umcg_task

pointer is added to struct task_struct.

Comments in include/uapi/linux/umcg.h and kernel/sched/umcg.c
provide many details on how UMCG syscalls are to be used.

What is NOT implemented yet:
- timeouts;
- preemption.

All the basics (wait/wake/swap, block/wake detection) seem to be
working.

v0.2->v0.3 changes:
- new protocol for working with idle workers and servers is used, to avoid
  spinning in the kernel;
- waking a UMCG task now does not require spinning.

Signed-off-by: Peter Oskolkov <posk@xxxxxxxxxx>
---
 arch/x86/entry/syscalls/syscall_64.tbl |   2 +
 include/linux/sched.h                  |   6 +
 include/linux/syscalls.h               |   4 +
 include/uapi/asm-generic/unistd.h      |   8 +-
 include/uapi/linux/umcg.h              | 259 +++++++++++++
 init/Kconfig                           |  10 +
 kernel/exit.c                          |   7 +
 kernel/sched/Makefile                  |   1 +
 kernel/sched/core.c                    |  17 +-
 kernel/sched/umcg.c                    | 485 +++++++++++++++++++++++++
 kernel/sched/umcg.h                    |  13 +
 kernel/sys_ni.c                        |   4 +
 12 files changed, 813 insertions(+), 3 deletions(-)
 create mode 100644 include/uapi/linux/umcg.h
 create mode 100644 kernel/sched/umcg.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ce18119ea0d0..0c6c7fd72b0b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -368,6 +368,8 @@
 444	common	landlock_create_ruleset	sys_landlock_create_ruleset
 445	common	landlock_add_rule	sys_landlock_add_rule
 446	common	landlock_restrict_self	sys_landlock_restrict_self
+447	common	umcg_ctl		sys_umcg_ctl
+448	common	umcg_wait		sys_umcg_wait

 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50db9496c99d..185ad1cdde77 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -66,6 +66,7 @@ struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
+struct umcg_task;

 /*
  * Task state bitmask. NOTE! These bits are also
@@ -1223,6 +1224,10 @@ struct task_struct {
 	unsigned long rseq_event_mask;
 #endif

+#ifdef CONFIG_UMCG
+	struct umcg_task __user *umcg_task;
+#endif
+
 	struct tlbflush_unmap_batch	tlb_ubc;

 	union {
@@ -1599,6 +1604,7 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
+#define PF_UMCG_WORKER		0x01000000	/* UMCG worker */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_PIN		0x10000000	/* Allocation context constrained to zones which allow long term pinning. */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 050511e8f1f8..f3e1ef8d842f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -71,6 +71,7 @@ struct open_how;
 struct mount_attr;
 struct landlock_ruleset_attr;
 enum landlock_rule_type;
+struct umcg_task;

 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1050,6 +1051,9 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _
 asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type,
 		const void __user *rule_attr, __u32 flags);
 asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
+asmlinkage long sys_umcg_ctl(u32 flags, struct umcg_task __user *self);
+asmlinkage long sys_umcg_wait(u32 flags, u64 abs_timeout);
+

 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 6de5a7fc066b..1a4c9ac0e296 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -873,8 +873,14 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
 #define __NR_landlock_restrict_self 446
 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)

+#define __NR_umcg_ctl 447
+__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl)
+#define __NR_umcg_wait 448
+__SYSCALL(__NR_umcg_wait, sys_umcg_wait)
+
+
 #undef __NR_syscalls
-#define __NR_syscalls 447
+#define __NR_syscalls 449

 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/umcg.h b/include/uapi/linux/umcg.h
new file mode 100644
index 000000000000..402974b475bf
--- /dev/null
+++ b/include/uapi/linux/umcg.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_UMCG_H
+#define _UAPI_LINUX_UMCG_H
+
+#include <linux/limits.h>
+#include <linux/types.h>
+
+/*
+ * UMCG: User Managed Concurrency Groups.
+ *
+ * Syscalls (see kernel/sched/umcg.c):
+ *      sys_umcg_ctl()  - register/unregister UMCG tasks;
+ *      sys_umcg_wait() - wait/wake/context-switch.
+ *
+ * struct umcg_task (below): controls the state of UMCG tasks.
+ */
+
+/*
+ * UMCG task states, the first 8 bits. The states represent the user space
+ * point of view.
+ *
+ * UMCG tasks can be either RUNNING, i.e. doing useful work, or IDLE,
+ * i.e. have no work assigned to them and blocked in sys_umcg_wait().
+ *
+ * In addition, when a UMCG worker blocks in the kernel (e.g. on I/O),
+ * it is marked as BLOCKED; when a BLOCKED worker completes its blocking
+ * operation, it is marked as IDLE and added to idle_workers list (see
+ * struct umcg_task below), until a server "runs" it.
+ *
+ * UMCG servers continue to be considered as RUNNING even if they are blocked
+ * in the kernel in any way other than in sys_umcg_wait().
+ *
+ * State transitions:
+ *
+ * RUNNING => IDLE:   the current RUNNING task becomes IDLE by calling
+ *                    sys_umcg_wait();
+ * IDLE => RUNNING:   - another worker or server task called
+ *                      sys_umcg_wait() with self->next_tid pointing to the
+ *                      task transitioning from IDLE to RUNNING (mostly
+ *                      applies to workers and basic tasks);
+ *                    - the userspace marked and IDLE task as RUNNING and
+ *                      sent a signal to it (thus interrupting sys_umcg_wait);
+ *                    - servers: the kernel wakes an IDLE server from
+ *                      idle_servers list when a BLOCKED worker becomes IDLE
+ *                      (see below);
+ *                    - servers: the kernel wakes and IDLE server that
+ *                      is "attached" to a RUNNING worker when the worker
+ *                      becomes BLOCKED;
+ * RUNNING => BLOCKED: when a RUNNING UMCG worker blocks in the kernel,
+ *                     the kernel marks it as BLOCKED (and wakes its server);
+ * BLOCKED => IDLE:    when a BLOCKED UMCG worker finishes its blocking
+ *                     operation, the kernel marks it as IDLE, adds it to
+ *                     the list of idle workers (see struct umcg_task) and
+ *                     wakes an idle server from the list of idle servers, if
+ *                     available.
+ *
+ * Note 1: only the transitions listed above are possible; these state
+ *         transitions never happen:
+ *         - IDLE => BLOCKED         (never happen)
+ *         - BLOCKED => RUNNING      (never happen)
+ *
+ * Note 2: only UMCG workers (UMCG tasks registered with UMCG_CTL_WORKER
+ *         flag set) are subject to block/wake detection logic;
+ *
+ * Note 3: if a worker has UMC_TF_LOCKED state flag set, it behaves as
+ *         a server, i.e. the block/wake detection is disabled (this is a UMCG
+ *         equivalent of task_lock() or preempt_disable()). UMCG_TF_LOCKED
+ *         flag is cleared by the kernel when the worker goes to sleep in
+ *         umcg_wait().
+ *
+ * Note 4: changing the value of umcg_task.state field is the responsibility
+ *         of the party initiating the state change: when a state transition
+ *         is initiated by the userspace via a call to sys_umcg_wait(), it
+ *         is the userspace's responsibility to change the value of the
+ *         umcg_task.state field; when a state transition is initiated
+ *         by the kernel during worker block/wake handling, it is the kernel
+ *         who marks the worker as BLOCKED or IDLE, and the server as RUNNING.
+ */
+#define UMCG_TASK_NONE			0
+#define UMCG_TASK_RUNNING		1
+#define UMCG_TASK_IDLE			2
+#define UMCG_TASK_BLOCKED		3
+
+#define UMCG_TF_STATE_MASK		0xff
+
+/* UMCG task state flags, bits 8-15 */
+
+/*
+ * UMCG_TF_LOCKED: locked by the userspace; workers with UMCG_TF_LOCKED set
+ * do not become BLOCKED and do not wake their attached server.
+ */
+#define UMCG_TF_LOCKED			(1 << 8)
+
+/**
+ * struct umcg_task - controls the state of UMCG tasks.
+ *
+ * UMCG tasks can be:
+ *
+ * - UMCG workers: must have a UMCG server assigned when running (unless
+ *                 UMCG_TF_LOCKED flag is set); the server is woken when
+ *                 the worker blocks; has PF_UMCG_WORKER task flag set
+ *                 in task_struct.
+ *
+ *                 Both @idle_servers_ptr and @idle_workes_ptr are !NULL
+ *                 when running or calling sys_umcg_wait().
+ *
+ *                 A worker's state can be:
+ *                 - RUNNING: is schedulable by the kernel, has a server
+ *                            assigned in @server_tid;
+ *                 - IDLE: not schedulable by the kernel; can be
+ *                         context-switched into via sys_umcg_wait;
+ *                 - BLOCKED: blocked in the kernel (e.g. on I/O).
+ *
+ * - UMCG servers: A server's state can be:
+ *                 - RUNNING: behaves like a "normal" task: is schedulable
+ *                            by the kernel, can block on I/O, etc.
+ *                 - IDLE: not schedulable by the kernel.
+ *
+ * See sys_umcg_ctl() documentation in kernel/sched/umcg.c for a detailed
+ * explanation of how UMCG task types are determined.
+ *
+ * See sys_umcg_wait() documentation in kernel/sched/ucmg.c for a detailed
+ * explanation of server/worker interactions.
+ *
+ * Once a UMCG task is registered, it may not change its type.
+ *
+ * The struct is aligned at 64 bytes to ensure that it fits into
+ * a single cache line.
+ */
+struct umcg_task {
+	/**
+	 * @state: the current state of the UMCG task described by this struct.
+	 *
+	 * Readable/writable by both the kernel and the userspace.
+	 *
+	 * UMCG task state:
+	 *   bits  0 -  7: task state;
+	 *   bits  8 - 15: state flags;
+	 *   bits 16 - 23: reserved; must be zeroes;
+	 *   bits 24 - 31: for userspace use.
+	 */
+	uint32_t	state;			/* r/w */
+
+	/**
+	 * @api_version: the version of UMCG API the userspace would like
+	 *               to use. Must be set before calling umcg_ctl
+	 *               and must not be changed afterwards.
+	 */
+	uint32_t	api_version;		/* r   */
+
+	/**
+	 * @server_tid: the TID of the server UMCG task that should be
+	 *              woken when this WORKER becomes BLOCKED. Can be zero.
+	 *
+	 *              If this is a UMCG server, @server_tid should
+	 *              contain the TID of @self - it will be used to find
+	 *              the task_struct to wake when pulled from
+	 *              @idle_servers.
+	 *
+	 * Read-only for the kernel, read/write for the userspace.
+	 */
+	uint32_t	server_tid;		/* r   */
+
+	/**
+	 * @next_tid: the TID of the UMCG task that should be context-switched
+	 *            into in sys_umcg_wait(). Can be zero.
+	 *
+	 * Read-only for the kernel, read/write for the userspace.
+	 */
+	uint32_t	next_tid;		/* r   */
+
+	/**
+	 * @idle_servers_ptr: a single-linked list pointing to the list
+	 *                    of idle servers. Can be NULL.
+	 *
+	 * Readable/writable by both the kernel and the userspace: the
+	 * userspace adds items to the list, the kernel removes them.
+	 *
+	 * This is a single-linked list (stack): head->next->next->next->NULL.
+	 * "next" nodes are idle_servers_ptr fields in struct umcg_task.
+	 *
+	 * Example:
+	 *
+	 *  a running worker             idle server 1        idle server 2
+	 *
+	 * struct umct_task:             struct umcg_task:    struct umcg_task:
+	 *    state                         state                state
+	 *    api_version                   api_version          api_version
+	 *    ...                           ...                  ...
+	 *    idle_servers_ptr --> head --> idle_servers_ptr --> idle_servers_ptr --> NULL
+	 *    ...                           ...                  ...
+	 *
+	 *
+	 * Due to the way struct umcg_task is aligned, idle_servers_ptr
+	 * is aligned at 8 byte boundary, and so has its first byte as zero
+	 * when it holds a valid pointer.
+	 *
+	 * When pulling idle servers from the list, the kernel marks nodes as
+	 * "deleted" by ORing the node value (the pointer) with 1UL atomically.
+	 * If a node is "deleted" (i.e. its value AND 1UL is not zero),
+	 * the kernel proceeds to the next node.
+	 *
+	 * The kernel checks at most [nr_cpu_ids * 2] first nodes in the list.
+	 *
+	 * It is NOT considered an error if the kernel cannot find an idle
+	 * server.
+	 *
+	 * The userspace is responsible for cleanup/gc (i.e. for actually
+	 * removing nodes marked as "deleted" from the list).
+	 */
+	uint64_t	idle_servers_ptr;	/* r/w */
+
+	/**
+	 * @idle_workers_ptr: a single-linked list pointing to the list
+	 *                    of idle workers. Can be NULL.
+	 *
+	 * Readable/writable by both the kernel and the userspace: the
+	 * kernel adds items to the list, the userspace removes them.
+	 *
+	 * The list (stack) is structured the same way as idle_servers_ptr
+	 * above. The kernel pushes new nodes to the beginning of the list
+	 * by assigning the current head value to the node's idle_workers_ptr
+	 * and trying to atomically change the head to point to the new node.
+	 *
+	 * The kernel tries at most [nr_cpu_ids + 1] times to push a node
+	 * onto the stack, after which the idle worker will sleep a short
+	 * while before attempting to do so again. After several failed
+	 * attempts the kernel will SIGKILL the worker.
+	 */
+	uint64_t	idle_workers_ptr;	/* r/w */
+} __attribute__((packed, aligned(8 * sizeof(__u64))));
+
+/**
+ * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl
+ * @UMCG_CTL_REGISTER:   register the current task as a UMCG task
+ * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
+ * @UMCG_CTL_WORKER:     register the current task as a UMCG worker
+ *
+ * See sys_umcg_ctl documentation for more details.
+ */
+enum umcg_ctl_flag {
+	UMCG_CTL_REGISTER	= 0x00001,
+	UMCG_CTL_UNREGISTER	= 0x00002,
+	UMCG_CTL_WORKER		= 0x10000,
+};
+
+/**
+ * enum umcg_wait_flag - flags to pass to sys_umcg_wait
+ * @UMCG_WAIT_WAKE_ONLY:      wake @self->next_tid, don't put @self to sleep;
+ * @UMCG_WAIT_WF_CURRENT_CPU: wake @self->next_tid on the current CPU
+ *                            (use WF_CURRENT_CPU); @UMCG_WAIT_WAKE_ONLY
+ *                            must be set.
+ */
+enum umcg_wait_flag {
+	UMCG_WAIT_WAKE_ONLY			= 1,
+	UMCG_WAIT_WF_CURRENT_CPU		= 2,
+};
+
+#endif /* _UAPI_LINUX_UMCG_H */
diff --git a/init/Kconfig b/init/Kconfig
index a61c92066c2e..c15a50a61ba6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1662,6 +1662,16 @@ config MEMBARRIER

 	  If unsure, say Y.

+config UMCG
+	bool "Enable User Managed Concurrency Groups API"
+	depends on X86_64
+	default n
+	help
+	  Enable User Managed Concurrency Groups API, which form the basis
+	  for an in-process M:N userspace scheduling framework.
+	  At the moment this is an experimental/RFC feature that is not
+	  guaranteed to be backward-compatible.
+
 config KALLSYMS
 	bool "Load all symbols for debugging/ksymoops" if EXPERT
 	default y
diff --git a/kernel/exit.c b/kernel/exit.c
index fd1c04193e18..dc8398558d87 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -744,6 +744,13 @@ void __noreturn do_exit(long code)
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");

+#ifdef CONFIG_UMCG
+	if (unlikely(tsk->flags & PF_UMCG_WORKER))
+		tsk->flags &= ~PF_UMCG_WORKER;
+
+	tsk->umcg_task = NULL;
+#endif
+
 	/*
 	 * If do_exit is called because this processes oopsed, it's possible
 	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 978fcfca5871..e4e481eee1b7 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o
 obj-$(CONFIG_CPU_ISOLATION) += isolation.o
 obj-$(CONFIG_PSI) += psi.o
 obj-$(CONFIG_SCHED_CORE) += core_sched.o
+obj-$(CONFIG_UMCG) += umcg.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 293f5801bf81..f7ddeed72e30 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@

 #include "pelt.h"
 #include "smp.h"
+#include "umcg.h"

 /*
  * Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -3961,6 +3962,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 	p->migration_pending = NULL;
 #endif
+#ifdef CONFIG_UMCG
+	p->umcg_task = NULL;
+	p->flags &= ~PF_UMCG_WORKER;
+#endif
 }

 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -5975,10 +5980,14 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	 * in the possible wakeup of a kworker and because wq_worker_sleeping()
 	 * requires it.
 	 */
-	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
 		preempt_disable();
 		if (task_flags & PF_WQ_WORKER)
 			wq_worker_sleeping(tsk);
+#ifdef CONFIG_UMCG
+		else if (task_flags & PF_UMCG_WORKER)
+			umcg_wq_worker_sleeping(tsk);
+#endif
 		else
 			io_wq_worker_sleeping(tsk);
 		preempt_enable_no_resched();
@@ -5997,9 +6006,13 @@ static inline void sched_submit_work(struct task_struct *tsk)

 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
+#ifdef CONFIG_UMCG
+		else if (tsk->flags & PF_UMCG_WORKER)
+			umcg_wq_worker_running(tsk);
+#endif
 		else
 			io_wq_worker_running(tsk);
 	}
diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c
new file mode 100644
index 000000000000..f87c32974882
--- /dev/null
+++ b/kernel/sched/umcg.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * User Managed Concurrency Groups (UMCG).
+ *
+ * See include/uapi/linux/umcg.h for more documentation.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/umcg.h>
+#include <linux/freezer.h>
+
+#include "sched.h"
+#include "umcg.h"
+
+static int umcg_validate_version(u32 api_version)
+{
+	if (api_version == 1)
+		return 0;
+	return 1;
+}
+
+/**
+ * sys_umcg_ctl: (un)register the current task as a UMCG task.
+ * @flags:       ORed values from enum umcg_ctl_flag; see below;
+ * @self:        a pointer to struct umcg_task that describes this
+ *               task and governs the behavior of sys_umcg_wait if
+ *               registering; must be NULL if unregistering.
+ *
+ * @flags & UMCG_CTL_REGISTER: register a UMCG task:
+ *         UMCG workers:
+ *              - self->state must be UMCG_TASK_IDLE
+ *              - @flags & UMCG_CTL_WORKER
+ *         UMCG servers:
+ *              - self->state must be UMCG_TASK_RUNNING
+ *              - !(@flags & UMCG_CTL_WORKER)
+ *
+ *         All tasks:
+ *              - self->api_version must match one of the supported API
+ *                versions
+ *              - self->server_tid must be zero
+ *              - self->next_tid must be zero
+ *
+ *         If the conditions above are met, sys_umcg_ctl() immediately returns
+ *         if the registered task is a server; a worker will be added to
+ *         idle_workers_ptr, and the worker put to sleep; an idle server
+ *         from idle_servers_ptr will be woken, if any.
+ *
+ * @flags == UMCG_CTL_UNREGISTER: unregister a UMCG task. If the current task
+ *           is a UMCG worker, the userspace is responsible for waking its
+ *           server (before or after calling sys_umcg_ctl).
+ *
+ * Return:
+ * 0                - success
+ * > 0              - the highest supported API version if @self->api_version
+ *                    is not supported (when registering)
+ * -EFAULT          - failed to read @self
+ * -EINVAL          - some other error occurred
+ */
+SYSCALL_DEFINE2(umcg_ctl, u32, flags, struct umcg_task __user *, self)
+{
+	struct umcg_task ut;
+	int ret;
+
+	if (flags == UMCG_CTL_UNREGISTER) {
+		if (self || !current->umcg_task)
+			return -EINVAL;
+
+		if (current->flags & PF_UMCG_WORKER)
+			current->flags &= ~PF_UMCG_WORKER;
+
+		current->umcg_task = NULL;
+		return 0;
+	}
+
+	/* Register the current task as a UMCG task. */
+	if (!(flags & UMCG_CTL_REGISTER))
+		return -EINVAL;
+
+	flags &= ~UMCG_CTL_REGISTER;
+	if (flags && flags != UMCG_CTL_WORKER)
+		return -EINVAL;
+
+	if (current->umcg_task)
+		return -EINVAL;
+
+	if (copy_from_user(&ut, self, sizeof(ut)))
+		return -EFAULT;
+
+	ret = umcg_validate_version(ut.api_version);
+	if (ret)
+		return ret;
+
+	if (ut.server_tid || ut.next_tid)
+		return -EINVAL;
+
+	if (flags == UMCG_CTL_WORKER) {
+		if (ut.state != UMCG_TASK_IDLE)
+			return -EINVAL;
+
+		current->umcg_task = self;
+		current->flags |= PF_UMCG_WORKER;
+
+		umcg_wq_worker_running(current);  /* Will sleep. */
+		return 0;
+	}
+
+	/* This is a server task. */
+	if (ut.state != UMCG_TASK_RUNNING)
+		return -EINVAL;
+
+	current->umcg_task = self;
+	return 0;
+}
+
+/* Sleep until interrupted or self.state becomes RUNNING or timeout expires. */
+static int umcg_idle_loop(u64 abs_timeout)
+{
+	struct umcg_task __user *self = current->umcg_task;
+
+	if (abs_timeout)
+		return -EOPNOTSUPP;
+
+	/* Unlock the worker, if locked. */
+	if (current->flags & PF_UMCG_WORKER) {
+		u32 umcg_state;
+
+		smp_mb();  /* Protect the read below. */
+		if (get_user(umcg_state, &self->state))
+			return -EFAULT;
+
+		if ((umcg_state & UMCG_TF_LOCKED) && cmpxchg_user_32(
+					&self->state, &umcg_state,
+					umcg_state & ~UMCG_TF_LOCKED))
+			return -EFAULT;
+	}
+
+	while (true) {
+		u32 umcg_state;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		smp_mb();  /* Order with set_current_state() above. */
+		if (get_user(umcg_state, &self->state)) {
+			set_current_state(TASK_RUNNING);
+			return -EFAULT;
+		}
+
+		if ((umcg_state & UMCG_TF_STATE_MASK) ==
+				UMCG_TASK_RUNNING) {
+			set_current_state(TASK_RUNNING);
+			return 0;
+		}
+
+		freezable_schedule();
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		if (get_user(umcg_state, &self->state))
+			return -EFAULT;
+
+		if ((umcg_state & UMCG_TF_STATE_MASK) == UMCG_TASK_RUNNING)
+			return 0;
+	}
+}
+
+/*
+ * Try to wake up. May be called with preempt_disable set.
+ *
+ * Note: umcg_ttwu succeeds even if ttwu fails: see wait/wake state
+ *       ordering logic documented in sys_umcg_wait() below.
+ */
+static int umcg_ttwu(u32 next_tid, int wake_flags)
+{
+	struct task_struct *next;
+
+	rcu_read_lock();
+	next = find_task_by_vpid(next_tid);
+	if (!next || !(READ_ONCE(next->umcg_task))) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	try_to_wake_up(next, TASK_NORMAL, wake_flags);  /* Result ignored. */
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * At the moment, umcg_do_context_switch simply wakes up @next with
+ * WF_CURRENT_CPU and puts the current task to sleep.
+ *
+ * In the future an optimization will be added to adjust runtime accounting
+ * so that from the kernel scheduling perspective the two tasks are
+ * essentially treated as one.
+ */
+static int umcg_do_context_switch(u32 next_tid, u64 abs_timeout)
+{
+	struct task_struct *next;
+
+	if (abs_timeout)
+		return -EOPNOTSUPP;
+
+	rcu_read_lock();
+	next = find_task_by_vpid(next_tid);
+	if (!next) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	/* TODO: instead of wake + sleep, do a context switch. */
+	try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU);  /* Result ignored. */
+	rcu_read_unlock();
+
+	return umcg_idle_loop(abs_timeout);
+}
+
+/**
+ * sys_umcg_wait: sleep the current task and/or wake another task.
+ * @flags:        zero or a value from enum umcg_wait_flag.
+ * @abs_timeout:  when to wake the task; zero for no timeout. NOT SUPPORTED yet.
+ *
+ * @self->state must be UMCG_TASK_IDLE (where @self is current->umcg_task
+ * if !(@flags & UMCG_WAIT_WAKE_ONLY).
+ *
+ * If @self->next_tid is not zero, it must point to an IDLE UMCG task blocked
+ * in sys_umcg_wait(). The userspace must have changed its state from IDLE to
+ * RUNNING before calling sys_umcg_wait() in the current task. This "next"
+ * task will be woken (context-switched-to on the fast path) when the current
+ * task is put to sleep.
+ *
+ * If this is a worker (PF_UMCG_WORKER is set), and @self->next_tid is zero,
+ * the server assigned to this worker (@self->server_tid) will be
+ * woken/switched-to; same rules apply (the server must be waiting in
+ * sys_umcg_wait(); its state must be RUNNING now).
+ *
+ * If @self->next_tid points to a UMCG worker, it must have its server_tid
+ * set, with the server blocked in sys_umcg_wait().
+ *
+ *
+ * Note: wait/wake ordering: to avoid missing wakeups, the following
+ *       state changes order is required:
+ *
+ *       wait: the userspace marks the current task's UMCG state as IDLE
+ *             and calls sys_umcg_wait().
+ *       wake: the userspace marks the wakee's UMCG state as RUNNING and
+ *             calls sys_umcg_wait() with the wakee's TID in self->next_tid;
+ *
+ *       To wake a umcg task, the kernel issues a single ttwu() call, ignoring
+ *       the result. On the wait path the kernel carefully orders task
+ *       state changes with umcg state checks to ensure the wakeup above
+ *       is not lost. See umcg_idle_loop for details.
+ *
+ * Return:
+ * 0             - OK;
+ * -ETIMEDOUT    - the timeout expired;
+ * -EFAULT       - failed accessing struct umcg_task __user of the current
+ *                 task;
+ * -ESRCH        - the task to wake not found or not a UMCG task;
+ * -EINVAL       - another error happened (e.g. bad @flags, or the current
+ *                 task is not a UMCG task, etc.)
+ */
+SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, abs_timeout)
+{
+	struct umcg_task __user *self = current->umcg_task;
+	u32 next_tid;
+
+	if (!self)
+		return -EINVAL;
+
+	if (get_user(next_tid, &self->next_tid))
+		return -EFAULT;
+
+	if (flags & UMCG_WAIT_WAKE_ONLY) {
+		if (!next_tid || abs_timeout)
+			return -EINVAL;
+
+		flags &= ~UMCG_WAIT_WAKE_ONLY;
+		if (flags & ~UMCG_WAIT_WF_CURRENT_CPU)
+			return -EINVAL;
+
+		return umcg_ttwu(next_tid, flags & UMCG_WAIT_WF_CURRENT_CPU ?
+				WF_CURRENT_CPU : 0);
+	}
+
+	if (!next_tid && current->flags & PF_UMCG_WORKER) {
+		if (get_user(next_tid, &self->server_tid))
+			return -EFAULT;
+	}
+
+	if (next_tid)
+		return umcg_do_context_switch(next_tid, abs_timeout);
+
+	return umcg_idle_loop(abs_timeout);
+}
+
+#define umcg_die() \
+{                                                                \
+	pr_warn("UMCG: umcg_die: %s:%d\n", __FILE__, __LINE__);  \
+	force_sig(SIGSEGV);                                      \
+}
+
+/* Try to wake the server; may be called within preempt_disable section. */
+static bool try_to_wake_server(struct umcg_task __user *ut_server, u32 server_tid)
+{
+	u32 state = UMCG_TASK_IDLE;
+
+	if (WARN_ON(!(ut_server || server_tid)))
+		return false;
+
+	if (!ut_server) {
+		struct task_struct *tsk;
+
+		rcu_read_lock();
+		tsk = find_task_by_vpid(server_tid);
+		if (tsk)
+			ut_server = READ_ONCE(tsk->umcg_task);
+		rcu_read_unlock();
+
+		if (!ut_server)
+			return false;
+	}
+
+	if (!server_tid && get_user(server_tid, &ut_server->server_tid))
+		return false;
+
+	if (cmpxchg_user_32(&ut_server->state, &state, UMCG_TASK_RUNNING)) {
+		if (state != UMCG_TASK_RUNNING)
+			umcg_die();  /* The userspace broke the contract. */
+		return false;
+	}
+
+	/* TODO: make a smarter context switch when available. */
+	return umcg_ttwu(server_tid, WF_CURRENT_CPU) == 0;
+}
+
+/*
+ * Change the worker's state RUNNING => BLOCKED or BLOCKED => IDLE, depending
+ * on context (@sleeping).
+ *
+ * May be called with preempt_disable.
+ *
+ * Returns true to continue; false to abort.
+ */
+static bool wq_worker_change_state(struct umcg_task __user *ut_worker,
+		bool sleeping)
+{
+	u32 prev_state, next_state;
+	int ret;
+
+	smp_mb();  /* Guard the read below. */
+	if (get_user_nosleep(prev_state, &ut_worker->state)) {
+		umcg_die();
+		return false;
+	}
+
+	if (prev_state & UMCG_TF_LOCKED)
+		return false;
+
+	if (sleeping) {
+		if ((prev_state & UMCG_TF_STATE_MASK) !=
+				UMCG_TASK_RUNNING)
+			return false;
+	} else {
+		if ((prev_state & UMCG_TF_STATE_MASK) ==
+				UMCG_TASK_RUNNING) {
+			/*
+			 * Workers with servers attached should
+			 * pass through; workers without servers
+			 * should wait.
+			 */
+			u32 server_tid;
+
+			if (get_user_nosleep(server_tid,
+						&ut_worker->server_tid)) {
+				umcg_die();  /* The userspace broke the conract. */
+				return false;
+			}
+
+			if (server_tid)
+				return false;
+		}
+	}
+
+	next_state = prev_state & ~UMCG_TF_STATE_MASK;
+	next_state |= sleeping ? UMCG_TASK_BLOCKED : UMCG_TASK_IDLE;
+
+	ret = cmpxchg_user_32(&ut_worker->state, &prev_state, next_state);
+
+	if (!ret)
+		return true;
+
+	umcg_die(); /* The userspace broke the contract. */
+	return false;
+}
+
+/* Called from sched_submit_work() with preempt_disable. */
+void umcg_wq_worker_sleeping(struct task_struct *tsk)
+{
+	struct umcg_task __user *ut_worker = tsk->umcg_task;
+	u32 server_tid;
+
+	if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker"))
+		return;
+
+	/* Step one: mark the worker as BLOCKED. */
+	if (!wq_worker_change_state(ut_worker, true))
+		return;
+
+	/* Step two: wake the server, if any. */
+	if (get_user_nosleep(server_tid, &ut_worker->server_tid)) {
+		umcg_die();  /* EFAULT */
+		return;
+	}
+
+	if (!server_tid)
+		return;
+
+	/* TODO: make a smarter context switch when available. */
+	try_to_wake_server(NULL, server_tid);
+}
+
+/* Called from sched_update_worker(). May sleep. */
+void umcg_wq_worker_running(struct task_struct *tsk)
+{
+	struct umcg_task __user *ut_worker = tsk->umcg_task;
+	u64 head, popped_server;
+
+	if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker"))
+		return;
+
+	/*
+	 * Remove the workqueue flag to avoid triggering
+	 * umcg_wq_worker_sleeping.
+	 */
+	current->flags &= ~PF_UMCG_WORKER;
+
+	/* Step 1: mark the worker as IDLE and add to the idle workers list. */
+	if (!wq_worker_change_state(ut_worker, false))
+		goto out;
+
+	if (get_user(head, &ut_worker->idle_workers_ptr))
+		goto die;  /* EFAULT */
+
+	if (!head)
+		goto die;  /* The userspace broke the conract. */
+
+	if (umcg_sll_push((u64 __user *)head,
+			(u64 __user *)&ut_worker->idle_workers_ptr,
+			nr_cpu_ids * 2 /* max number of times to push */))
+		goto die;  /* TODO: sleep+retry several times before dying. */
+
+	smp_mb();  /* Make sure steps 1 and 2 are ordered. */
+
+	/* Step 2: wake an idle server, if any. */
+	if (get_user(head, &ut_worker->idle_servers_ptr))
+		goto die;
+
+	/* The number of servers should not exceed the number of CPUs much. */
+	if (head && umcg_sll_pop((u64 __user *)head, &popped_server,
+				nr_cpu_ids * 2 /* max number of nodes to check*/))
+		goto die;
+
+	if (popped_server) {
+		struct umcg_task __user *ut_server = container_of(
+				(u64 *)popped_server,
+				struct umcg_task, idle_servers_ptr);
+
+		try_to_wake_server(ut_server, 0);
+	}
+
+	/* Step 3: sleep until woken by a server */
+	umcg_idle_loop(0);
+
+out:
+	current->flags |= PF_UMCG_WORKER;
+	return;
+
+die:
+	umcg_die();
+}
diff --git a/kernel/sched/umcg.h b/kernel/sched/umcg.h
index 435531d751f2..619ba02be1d4 100644
--- a/kernel/sched/umcg.h
+++ b/kernel/sched/umcg.h
@@ -9,6 +9,19 @@
 #include <asm/asm.h>
 #include <linux/atomic.h>

+#ifdef CONFIG_UMCG
+
+struct task_struct;
+
+/*
+ * umcg_wq_worker_[sleeping|running] are called in core.c by
+ * sched_submit_work() and sched_update_worker().
+ */
+void umcg_wq_worker_sleeping(struct task_struct *tsk);
+void umcg_wq_worker_running(struct task_struct *tsk);
+
+#endif  /* CONFIG_UMCG */
+
 /* TODO: move atomic operations below into arch/ headers */
 static inline int __try_cmpxchg_user_32(u32 *uval, u32 __user *uaddr,
 						u32 oldval, u32 newval)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0ea8128468c3..cd1be6356e42 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -272,6 +272,10 @@ COND_SYSCALL(landlock_create_ruleset);
 COND_SYSCALL(landlock_add_rule);
 COND_SYSCALL(landlock_restrict_self);

+/* kernel/sched/umcg.c */
+COND_SYSCALL(umcg_ctl);
+COND_SYSCALL(umcg_wait);
+
 /* arch/example/kernel/sys_example.c */

 /* mm/fadvise.c */
--
2.25.1




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux