On Fri, Jun 14, 2024 at 12:48:37PM +0200, Vincent Guittot wrote: > On Fri, 14 Jun 2024 at 11:28, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote: > > > Vincent [5] pointed out a case where the idle load kick will fail to > > > run on an idle CPU since the IPI handler launching the ILB will check > > > for need_resched(). In such cases, the idle CPU relies on > > > newidle_balance() to pull tasks towards itself. > > > > Is this the need_resched() in _nohz_idle_balance() ? Should we change > > this to 'need_resched() && (rq->nr_running || rq->ttwu_pending)' or > > something along those lines? > > It's not only this but also in do_idle() as well which exits the loop > to look for tasks to schedule Is that really a problem? Reading the initial email the problem seems to be newidle balance, not hitting schedule. Schedule should be fairly quick if there's nothing to do, no? > > I mean, it's fairly trivial to figure out if there really is going to be > > work there. > > > > > Using an alternate flag instead of NEED_RESCHED to indicate a pending > > > IPI was suggested as the correct approach to solve this problem on the > > > same thread. > > > > So adding per-arch changes for this seems like something we shouldn't > > unless there really are no other sane options. > > > > That is, I really think we should start with something like the below > > and then fix any fallout from that. > > The main problem is that need_resched becomes somewhat meaningless > because it doesn't only mean "I need to resched a task" and we have > to add more tests around even for those not using polling True, however we already had some of that by having the wakeup list, that made nr_running less 'reliable'. The thing is, most architectures seem to have the TIF_POLLING_NRFLAG bit, even if their main idle routine isn't actually using it, much of the idle loop until it hits the arch idle will be having it set and will thus tickle these cases *sometimes*. 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > > index 0935f9d4bb7b..cfa45338ae97 100644 > > --- a/kernel/sched/core.c > > +++ b/kernel/sched/core.c > > @@ -5799,7 +5800,7 @@ static inline struct task_struct * > > __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) > > { > > const struct sched_class *class; > > - struct task_struct *p; > > + struct task_struct *p = NULL; > > > > /* > > * Optimization: we know that if all tasks are in the fair class we can > > @@ -5810,9 +5811,11 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) > > if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && > > rq->nr_running == rq->cfs.h_nr_running)) { > > > > - p = pick_next_task_fair(rq, prev, rf); > > - if (unlikely(p == RETRY_TASK)) > > - goto restart; > > + if (rq->nr_running) { > > How do you tell the difference between a spurious need_resched() because of > polling and a cpu becoming idle? Isn't rq->nr_running zero in both > cases? Bah, true. It should also check current being idle, which then makes a mess of things again. Still, we shouldn't be calling newidle from idle, that's daft. I should probably not write code at 3am, but the below horror is what I came up with. --- diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0935f9d4bb7b..cfe8d3350819 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6343,19 +6344,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * Constants for the sched_mode argument of __schedule(). * * The mode argument allows RT enabled kernels to differentiate a - * preemption from blocking on an 'sleeping' spin/rwlock. Note that - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to - * optimize the AND operation out and just check for zero. + * preemption from blocking on an 'sleeping' spin/rwlock. 
*/ -#define SM_NONE 0x0 -#define SM_PREEMPT 0x1 -#define SM_RTLOCK_WAIT 0x2 - -#ifndef CONFIG_PREEMPT_RT -# define SM_MASK_PREEMPT (~0U) -#else -# define SM_MASK_PREEMPT SM_PREEMPT -#endif +#define SM_IDLE (-1) +#define SM_NONE 0 +#define SM_PREEMPT 1 +#define SM_RTLOCK_WAIT 2 /* * __schedule() is the main scheduler function. @@ -6396,11 +6390,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(unsigned int sched_mode) +static void __sched notrace __schedule(int sched_mode) { struct task_struct *prev, *next; unsigned long *switch_count; unsigned long prev_state; + bool preempt = sched_mode > 0; struct rq_flags rf; struct rq *rq; int cpu; @@ -6409,13 +6404,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, !!sched_mode); + schedule_debug(prev, preempt); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(!!sched_mode); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6449,7 +6444,12 @@ static void __sched notrace __schedule(unsigned int sched_mode) * that we form a control dependency vs deactivate_task() below. 
*/ prev_state = READ_ONCE(prev->__state); - if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { + if (sched_mode == SM_IDLE) { + if (!rq->nr_running) { + next = prev; + goto picked; + } + } else if (!preempt && prev_state) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { @@ -6483,6 +6483,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) } next = pick_next_task(rq, prev, &rf); +picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG @@ -6521,9 +6522,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) ++*switch_count; migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); + trace_sched_switch(preempt, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -6599,7 +6601,7 @@ static void sched_update_worker(struct task_struct *tsk) } } -static __always_inline void __schedule_loop(unsigned int sched_mode) +static __always_inline void __schedule_loop(int sched_mode) { do { preempt_disable(); @@ -6644,7 +6646,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->__state); do { - __schedule(SM_NONE); + __schedule(SM_IDLE); } while (need_resched()); }