The following commit has been merged into the sched/urgent branch of tip: Commit-ID: b55945c500c5723992504aa03b362fab416863a6 Gitweb: https://git.kernel.org/tip/b55945c500c5723992504aa03b362fab416863a6 Author: Peter Zijlstra <peterz@xxxxxxxxxxxxx> AuthorDate: Wed, 23 Oct 2024 11:36:41 +02:00 Committer: Peter Zijlstra <peterz@xxxxxxxxxxxxx> CommitterDate: Wed, 23 Oct 2024 20:52:26 +02:00 sched: Fix pick_next_task_fair() vs try_to_wake_up() race Syzkaller robot reported KCSAN tripping over the ASSERT_EXCLUSIVE_WRITER(p->on_rq) in __block_task(). The report noted that both pick_next_task_fair() and try_to_wake_up() were concurrently trying to write to the same p->on_rq, violating the assertion -- even though both paths hold rq->__lock. The logical consequence is that both code paths end up holding a different rq->__lock. And looking through ttwu(), this is possible when the __block_task() 'p->on_rq = 0' store is visible to the ttwu() 'p->on_rq' load, which then assumes the task is not queued and continues to migrate it. Rearrange things such that __block_task() releases @p with the store and no code thereafter will use @p again. Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Reported-by: syzbot+0ec1e96c2cdf5c0e512a@xxxxxxxxxxxxxxxxxxxxxxxxx Reported-by: Kent Overstreet <kent.overstreet@xxxxxxxxx> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> Tested-by: Marco Elver <elver@xxxxxxxxxx> Link: https://lkml.kernel.org/r/20241023093641.GE16066@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx --- kernel/sched/fair.c | 21 ++++++++++++++------- kernel/sched/sched.h | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c157d48..8796146 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) struct sched_entity *se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - SCHED_WARN_ON(se->sched_delayed); - SCHED_WARN_ON(se->on_rq); + /* + * Must not reference @se again, see __block_task(). + */ return NULL; } return se; @@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); - /* Fix-up what block_task() skipped. */ + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ __block_task(rq, p); } @@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) util_est_dequeue(&rq->cfs, p); - if (dequeue_entities(rq, &p->se, flags) < 0) { - util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) return false; - } - util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + */ + hrtick_update(rq); return true; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 081519f..9f9d1cc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2769,8 +2769,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) static inline void __block_task(struct rq *rq, struct task_struct *p) { - WRITE_ONCE(p->on_rq, 0); - ASSERT_EXCLUSIVE_WRITER(p->on_rq); if (p->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -2778,6 +2776,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p) atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); } extern void activate_task(struct rq *rq, struct task_struct *p, int flags);