The following commit has been merged into the sched/core branch of tip:

Commit-ID: 2e0199df252a536a03f4cb0810324dff523d1e79
Gitweb: https://git.kernel.org/tip/2e0199df252a536a03f4cb0810324dff523d1e79
Author: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
AuthorDate: Thu, 23 May 2024 11:03:42 +02:00
Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CommitterDate: Sat, 17 Aug 2024 11:06:43 +02:00

sched/fair: Prepare exit/cleanup paths for delayed_dequeue

When dequeue_task() is delayed it becomes possible to exit a task (or
cgroup) that is still enqueued. Ensure things are dequeued before
freeing.

Thanks to Valentin for asking the obvious questions and making
switched_from_fair() less weird.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Reviewed-by: Valentin Schneider <vschneid@xxxxxxxxxx>
Tested-by: Valentin Schneider <vschneid@xxxxxxxxxx>
Link: https://lkml.kernel.org/r/20240727105029.631948434@xxxxxxxxxxxxx
---
 kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 37acd53..9a84903 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8342,7 +8342,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 
 static void task_dead_fair(struct task_struct *p)
 {
-	remove_entity_load_avg(&p->se);
+	struct sched_entity *se = &p->se;
+
+	if (se->sched_delayed) {
+		struct rq_flags rf;
+		struct rq *rq;
+
+		rq = task_rq_lock(p, &rf);
+		if (se->sched_delayed) {
+			update_rq_clock(rq);
+			dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+		}
+		task_rq_unlock(rq, p, &rf);
+	}
+
+	remove_entity_load_avg(se);
 }
 
 /*
@@ -12854,10 +12868,22 @@ static void attach_task_cfs_rq(struct task_struct *p)
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
 	detach_task_cfs_rq(p);
+	/*
+	 * Since this is called after changing class, this is a little weird
+	 * and we cannot use DEQUEUE_DELAYED.
+	 */
+	if (p->se.sched_delayed) {
+		dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
+		p->se.sched_delayed = 0;
+		if (sched_feat(DELAY_ZERO) && p->se.vlag > 0)
+			p->se.vlag = 0;
+	}
 }
 
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+	SCHED_WARN_ON(p->se.sched_delayed);
+
 	attach_task_cfs_rq(p);
 
 	set_task_max_allowed_capacity(p);
@@ -13008,28 +13034,35 @@ void online_fair_sched_group(struct task_group *tg)
 
 void unregister_fair_sched_group(struct task_group *tg)
 {
-	unsigned long flags;
-	struct rq *rq;
 	int cpu;
 
 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(cpu) {
-		if (tg->se[cpu])
-			remove_entity_load_avg(tg->se[cpu]);
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+		struct sched_entity *se = tg->se[cpu];
+		struct rq *rq = cpu_rq(cpu);
+
+		if (se) {
+			if (se->sched_delayed) {
+				guard(rq_lock_irqsave)(rq);
+				if (se->sched_delayed) {
+					update_rq_clock(rq);
+					dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+				}
+				list_del_leaf_cfs_rq(cfs_rq);
+			}
+			remove_entity_load_avg(se);
+		}
 
 		/*
 		 * Only empty task groups can be destroyed; so we can speculatively
 		 * check on_list without danger of it being re-added.
 		 */
-		if (!tg->cfs_rq[cpu]->on_list)
-			continue;
-
-		rq = cpu_rq(cpu);
-
-		raw_spin_rq_lock_irqsave(rq, flags);
-		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-		raw_spin_rq_unlock_irqrestore(rq, flags);
+		if (cfs_rq->on_list) {
+			guard(rq_lock_irqsave)(rq);
+			list_del_leaf_cfs_rq(cfs_rq);
+		}
 	}
 }
 