On Thu, 29 Apr 2021 at 01:28, Scott Wood <swood@xxxxxxxxxx> wrote:
>
> This is required in order to be able to enable interrupts in the next
> patch. This is limited to PREEMPT_RT to avoid adding potentially
> measurable overhead to the non-RT case (requiring a double switch when
> pulling a task onto a newly idle cpu).

IIUC, only newidle_balance is a problem, not the idle load balance that
runs in the softirq. In that case, why not skip newidle_balance entirely
for PREEMPT_RT and kick an idle load balance instead, since you switch to
the idle thread context anyway? A rough (untested) sketch of what I mean
is at the end of this mail.

> update_misfit_status() is factored out for the PREEMPT_RT case, to ensure
> that the misfit status is kept consistent before dropping the lock.
>
> Signed-off-by: Scott Wood <swood@xxxxxxxxxx>
> ---
> v2: Use a balance callback, and limit to PREEMPT_RT
>
>  kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++----
>  1 file changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 794c2cb945f8..ff369c38a5b5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5660,6 +5660,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
>  #ifdef CONFIG_SMP
>
> +static const bool newidle_balance_in_callback = IS_ENABLED(CONFIG_PREEMPT_RT);
> +static DEFINE_PER_CPU(struct callback_head, rebalance_head);
> +
>  /* Working cpumask for: load_balance, load_balance_newidle. */
>  DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>  DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
> @@ -10549,7 +10552,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
>  *     0 - failed, no new tasks
>  *   > 0 - success, new (fair) tasks present
>  */
> -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> +static int do_newidle_balance(struct rq *this_rq, struct rq_flags *rf)
>  {
>         unsigned long next_balance = jiffies + HZ;
>         int this_cpu = this_rq->cpu;
> @@ -10557,7 +10560,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
>         int pulled_task = 0;
>         u64 curr_cost = 0;
>
> -       update_misfit_status(NULL, this_rq);
> +       if (!newidle_balance_in_callback)
> +               update_misfit_status(NULL, this_rq);
> +
>         /*
>          * We must set idle_stamp _before_ calling idle_balance(), such that we
>          * measure the duration of idle_balance() as idle time.
> @@ -10576,7 +10581,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
>          * further scheduler activity on it and we're being very careful to
>          * re-start the picking loop.
>          */
> -       rq_unpin_lock(this_rq, rf);
> +       if (!newidle_balance_in_callback)
> +               rq_unpin_lock(this_rq, rf);
>
>         if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>             !READ_ONCE(this_rq->rd->overload)) {
> @@ -10655,11 +10661,31 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
>         if (pulled_task)
>                 this_rq->idle_stamp = 0;
>
> -       rq_repin_lock(this_rq, rf);
> +       if (!newidle_balance_in_callback)
> +               rq_repin_lock(this_rq, rf);
>
>         return pulled_task;
>  }
>
> +static void newidle_balance_cb(struct rq *this_rq)
> +{
> +       update_rq_clock(this_rq);
> +       do_newidle_balance(this_rq, NULL);
> +}
> +
> +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> +{
> +       if (newidle_balance_in_callback) {
> +               update_misfit_status(NULL, this_rq);
> +               queue_balance_callback(this_rq,
> +                                      &per_cpu(rebalance_head, this_rq->cpu),
> +                                      newidle_balance_cb);
> +               return 0;
> +       }
> +
> +       return do_newidle_balance(this_rq, rf);
> +}
> +
>  /*
>   * run_rebalance_domains is triggered when needed from the scheduler tick.
>   * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
> --
> 2.27.0
>
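To make the suggestion above concrete, here is a rough and completely
untested sketch of the shape I have in mind, reusing the
do_newidle_balance() split from this patch. The actual kick is left as a
placeholder comment on purpose: whether the nohz kick machinery or
SCHED_SOFTIRQ is the right way to trigger the idle load balance here is
exactly the open question, so treat this as an illustration rather than a
proposal.

static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                /*
                 * Keep the misfit status consistent, but don't pull any
                 * task here with the rq lock held: we are about to switch
                 * to the idle thread anyway, so let the idle load balance
                 * do the pulling once we are there.
                 */
                update_misfit_status(NULL, this_rq);
                /*
                 * Placeholder: kick an idle load balance for this CPU
                 * (nohz kick or SCHED_SOFTIRQ); which mechanism fits best
                 * is the part that would need to be decided.
                 */
                return 0;
        }

        /* The non-RT case keeps the current behaviour. */
        return do_newidle_balance(this_rq, rf);
}

With something like this, the newidle path never runs load_balance() with
the rq lock held on RT, so there is no need to drop and re-take the lock
there, nor to queue a balance callback for it.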