The following commit has been merged into the sched/core branch of tip: Commit-ID: abc158c82ae555078aa5dd2d8407c3df0f868904 Gitweb: https://git.kernel.org/tip/abc158c82ae555078aa5dd2d8407c3df0f868904 Author: Peter Zijlstra <peterz@xxxxxxxxxxxxx> AuthorDate: Thu, 23 May 2024 10:55:59 +02:00 Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx> CommitterDate: Sat, 17 Aug 2024 11:06:42 +02:00 sched: Prepare generic code for delayed dequeue While most of the delayed dequeue code can be done inside the sched_class itself, there is one location where we do not have an appropriate hook, namely ttwu_runnable(). Add an ENQUEUE_DELAYED call to the on_rq path to deal with waking delayed dequeue tasks. Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> Reviewed-by: Valentin Schneider <vschneid@xxxxxxxxxx> Tested-by: Valentin Schneider <vschneid@xxxxxxxxxx> Link: https://lkml.kernel.org/r/20240727105029.200000445@xxxxxxxxxxxxx --- include/linux/sched.h | 1 + kernel/sched/core.c | 14 +++++++++++++- kernel/sched/sched.h | 2 ++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c1b4ee..f4a648e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -544,6 +544,7 @@ struct sched_entity { struct list_head group_node; unsigned int on_rq; + unsigned int sched_delayed; u64 exec_start; u64 sum_exec_runtime; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c59548..7356464 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2036,6 +2036,8 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { + SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq); @@ -3689,12 +3691,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { + update_rq_clock(rq); + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); if (!task_on_cpu(rq, p)) { /* * When on_rq && !on_cpu the task is preempted, see if * it should preempt the task that is current now. */ - update_rq_clock(rq); wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); @@ -4074,11 +4078,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * + * Specifically, given current runs ttwu() we must be before + * schedule()'s block_task(), as such this must not observe + * sched_delayed. + * * In particular: * - we rely on Program-Order guarantees for all the ordering, * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ + SCHED_WARN_ON(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4370,6 +4379,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); + /* A delayed task cannot be in clone(). */ + SCHED_WARN_ON(p->se.sched_delayed); + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 69ab3b0..ffca977 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2253,6 +2253,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2268,6 +2269,7 @@ extern const u32 sched_prio_to_wmult[40]; #endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 +#define ENQUEUE_DELAYED 0x200 #define RETRY_TASK ((void *)-1UL)