Defer newidle balancing to a balance callback on PREEMPT_RT, rather than
running it directly from the pick path. This is required in order to
enable interrupts in the next patch. The change is limited to PREEMPT_RT
to avoid adding potentially measurable overhead to the non-RT case (the
callback approach requires a double switch when pulling a task onto a
newly idle CPU). update_misfit_status() is factored out for the
PREEMPT_RT case, to ensure that the misfit status is kept consistent
before dropping the lock.

Signed-off-by: Scott Wood <swood@xxxxxxxxxx>
---
v2: Use a balance callback, and limit to PREEMPT_RT

 kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..ff369c38a5b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5660,6 +5660,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+static const bool newidle_balance_in_callback = IS_ENABLED(CONFIG_PREEMPT_RT);
+static DEFINE_PER_CPU(struct callback_head, rebalance_head);
+
 /* Working cpumask for: load_balance, load_balance_newidle. */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
@@ -10549,7 +10552,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
  *     0 - failed, no new tasks
  *   > 0 - success, new (fair) tasks present
  */
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int do_newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
 	unsigned long next_balance = jiffies + HZ;
 	int this_cpu = this_rq->cpu;
@@ -10557,7 +10560,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	int pulled_task = 0;
 	u64 curr_cost = 0;
 
-	update_misfit_status(NULL, this_rq);
+	if (!newidle_balance_in_callback)
+		update_misfit_status(NULL, this_rq);
+
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
 	 * measure the duration of idle_balance() as idle time.
@@ -10576,7 +10581,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	 * further scheduler activity on it and we're being very careful to
 	 * re-start the picking loop.
 	 */
-	rq_unpin_lock(this_rq, rf);
+	if (!newidle_balance_in_callback)
+		rq_unpin_lock(this_rq, rf);
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 	    !READ_ONCE(this_rq->rd->overload)) {
@@ -10655,11 +10661,31 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	if (pulled_task)
 		this_rq->idle_stamp = 0;
 
-	rq_repin_lock(this_rq, rf);
+	if (!newidle_balance_in_callback)
+		rq_repin_lock(this_rq, rf);
 
 	return pulled_task;
 }
 
+static void newidle_balance_cb(struct rq *this_rq)
+{
+	update_rq_clock(this_rq);
+	do_newidle_balance(this_rq, NULL);
+}
+
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+	if (newidle_balance_in_callback) {
+		update_misfit_status(NULL, this_rq);
+		queue_balance_callback(this_rq,
+				       &per_cpu(rebalance_head, this_rq->cpu),
+				       newidle_balance_cb);
+		return 0;
+	}
+
+	return do_newidle_balance(this_rq, rf);
+}
+
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
-- 
2.27.0