From: David Vernet <dvernet@xxxxxxxx> If set when calling scx_bpf_kick_cpu(), the invoking CPU will busy wait for the kicked cpu to enter the scheduler. See the following for example usage: https://github.com/sched-ext/scx/blob/main/scheds/c/scx_pair.bpf.c v2: - Updated to fit the updated kick_cpus_irq_workfn() implementation. - Include SCX_KICK_WAIT related information in debug dump. Signed-off-by: David Vernet <dvernet@xxxxxxxx> Reviewed-by: Tejun Heo <tj@xxxxxxxxxx> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> Acked-by: Josh Don <joshdon@xxxxxxxxxx> Acked-by: Hao Luo <haoluo@xxxxxxxxxx> Acked-by: Barret Rhoden <brho@xxxxxxxxxx> --- kernel/sched/core.c | 4 ++- kernel/sched/ext.c | 82 ++++++++++++++++++++++++++++++++++++++++---- kernel/sched/ext.h | 4 +++ kernel/sched/sched.h | 2 ++ 4 files changed, 85 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index de49b94844a9..d940c17dfe8a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6115,8 +6115,10 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) for_each_active_class(class) { p = class->pick_next_task(rq); - if (p) + if (p) { + scx_next_task_picked(rq, p, class); return p; + } } BUG(); /* The idle class should always have a runnable task. */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 80c313a56958..91c3d1851b45 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -579,6 +579,12 @@ enum scx_kick_flags { * task expires and the dispatch path is invoked. */ SCX_KICK_PREEMPT = 1LLU << 1, + + /* + * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will + * return after the target CPU finishes picking the next task. + */ + SCX_KICK_WAIT = 1LLU << 2, }; enum scx_tg_flags { @@ -713,6 +719,9 @@ static struct { #endif /* CONFIG_SMP */ +/* for %SCX_KICK_WAIT */ +static unsigned long __percpu *scx_kick_cpus_pnt_seqs; + /* * Direct dispatch marker. * @@ -2315,6 +2324,23 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return p; } +void scx_next_task_picked(struct rq *rq, struct task_struct *p, + const struct sched_class *active) +{ + lockdep_assert_rq_held(rq); + + if (!scx_enabled()) + return; +#ifdef CONFIG_SMP + /* + * Pairs with the smp_load_acquire() issued by a CPU in + * kick_cpus_irq_workfn() who is waiting for this CPU to perform a + * resched. + */ + smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); +#endif +} + #ifdef CONFIG_SMP static bool test_and_clear_cpu_idle(int cpu) @@ -3868,9 +3894,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) rq->curr->sched_class == &idle_sched_class) goto next; - seq_buf_printf(&s, "\nCPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu\n", + seq_buf_printf(&s, "\nCPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu pnt_seq=%lu\n", cpu, rq->scx.nr_running, rq->scx.flags, - rq->scx.ops_qseq); + rq->scx.ops_qseq, rq->scx.pnt_seq); seq_buf_printf(&s, " curr=%s[%d] class=%ps\n", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); @@ -3883,6 +3909,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (!cpumask_empty(rq->scx.cpus_to_preempt)) seq_buf_printf(&s, " cpus_to_preempt: %*pb\n", cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + seq_buf_printf(&s, " cpus_to_wait : %*pb\n", + cpumask_pr_args(rq->scx.cpus_to_wait)); if (rq->curr->sched_class == &ext_sched_class) scx_dump_task(&s, rq->curr, '*', now); @@ -4578,10 +4607,11 @@ static bool can_skip_idle_kick(struct rq *rq) return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING); } -static void kick_one_cpu(s32 cpu, struct rq *this_rq) +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) { struct rq *rq = cpu_rq(cpu); struct scx_rq *this_scx = &this_rq->scx; + bool should_wait = false; unsigned long flags; raw_spin_rq_lock_irqsave(rq, flags); @@ -4597,12 +4627,20 @@ static void kick_one_cpu(s32 cpu, struct rq *this_rq) cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); } + if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { + pseqs[cpu] = rq->scx.pnt_seq; + should_wait = true; + } + resched_curr(rq); } else { cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } raw_spin_rq_unlock_irqrestore(rq, flags); + + return should_wait; } static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) @@ -4623,10 +4661,12 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; + unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + bool should_wait = false; s32 cpu; for_each_cpu(cpu, this_scx->cpus_to_kick) { - kick_one_cpu(cpu, this_rq); + should_wait |= kick_one_cpu(cpu, this_rq, pseqs); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } @@ -4635,6 +4675,28 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) kick_one_cpu_if_idle(cpu, this_rq); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } + + if (!should_wait) + return; + + for_each_cpu(cpu, this_scx->cpus_to_wait) { + unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; + + if (cpu != cpu_of(this_rq)) { + /* + * Pairs with smp_store_release() issued by this CPU in + * scx_next_task_picked() on the resched path. + * + * We busy-wait here to guarantee that no other task can + * be scheduled on our core before the target CPU has + * entered the resched path. + */ + while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) + cpu_relax(); + } + + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } } /** @@ -4700,6 +4762,11 @@ void __init init_sched_ext_class(void) BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); #endif + scx_kick_cpus_pnt_seqs = + __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, + __alignof__(scx_kick_cpus_pnt_seqs[0])); + BUG_ON(!scx_kick_cpus_pnt_seqs); + for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); @@ -4709,6 +4776,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); } @@ -5038,8 +5106,8 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) if (flags & SCX_KICK_IDLE) { struct rq *target_rq = cpu_rq(cpu); - if (unlikely(flags & SCX_KICK_PREEMPT)) - scx_ops_error("PREEMPT cannot be used with SCX_KICK_IDLE"); + if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) + scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -5054,6 +5122,8 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) if (flags & SCX_KICK_PREEMPT) cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); + if (flags & SCX_KICK_WAIT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); } irq_work_queue(&this_rq->scx.kick_cpus_irq_work); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 1017439dcc00..5db35f627ea3 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -29,6 +29,8 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } +void scx_next_task_picked(struct rq *rq, struct task_struct *p, + const struct sched_class *active); void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -78,6 +80,8 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #define scx_enabled() false #define scx_switched_all() false +static inline void scx_next_task_picked(struct rq *rq, struct task_struct *p, + const struct sched_class *active) {} static inline void scx_tick(struct rq *rq) {} static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0ca2378bb252..c8cf6fbaed07 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -735,6 +735,8 @@ struct scx_rq { cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; struct irq_work kick_cpus_irq_work; }; #endif /* CONFIG_SCHED_CLASS_EXT */ -- 2.44.0