On Thu, Oct 22, 2020 at 08:29:42PM -0700, Josh Don wrote:
> Busy polling loops in the kernel such as network socket poll and kvm
> halt polling have performance problems related to process scheduler load
> accounting.

AFAICT you're not actually fixing the load accounting issue at all.

> This change also disables preemption for the duration of the busy
> polling loop. This is important, as it ensures that if a polling thread
> decides to end its poll to relinquish cpu to another thread, the polling
> thread will actually exit the busy loop and potentially block. When it
> later becomes runnable, it will have the opportunity to find an idle cpu
> via wakeup cpu selection.

At the cost of inducing a sleep+wake cycle, which is mucho expensive. So
this could go either way. No data presented.

> +void prepare_to_busy_poll(void)
> +{
> +	struct rq __maybe_unused *rq = this_rq();
> +	unsigned long __maybe_unused flags;
> +
> +	/* Preemption will be reenabled by end_busy_poll() */
> +	preempt_disable();
> +
> +#ifdef CONFIG_SMP
> +	raw_spin_lock_irqsave(&rq->lock, flags);
> +	/* preemption disabled; only one thread can poll at a time */
> +	WARN_ON_ONCE(rq->busy_polling);
> +	rq->busy_polling++;
> +	raw_spin_unlock_irqrestore(&rq->lock, flags);
> +#endif

Explain to me the purpose of that rq->lock usage.
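Presumably rq->busy_polling is only ever written by the local CPU, and
preemption is already off at this point; if so, the lock buys nothing.
A minimal lockless sketch (not from the patch; remote readers would
pair with READ_ONCE()):

	void prepare_to_busy_poll(void)
	{
		/* Preemption will be reenabled by end_busy_poll() */
		preempt_disable();
	#ifdef CONFIG_SMP
		/*
		 * With preemption off, this CPU is the sole writer of
		 * rq->busy_polling; remote CPUs only ever read it.
		 */
		WARN_ON_ONCE(this_rq()->busy_polling);
		WRITE_ONCE(this_rq()->busy_polling, 1);
	#endif
	}

(end_busy_poll() would clear the field the same way.)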
> +}
> +EXPORT_SYMBOL(prepare_to_busy_poll);

_GPL

> +
> +int continue_busy_poll(void)
> +{
> +	if (!single_task_running())
> +		return 0;

Why? If there's more, we'll end up in the below condition anyway.

> +
> +	/* Important that we check this, since preemption is disabled */
> +	if (need_resched())
> +		return 0;
> +
> +	return 1;
> +}
> +EXPORT_SYMBOL(continue_busy_poll);

_GPL

> +
> +/*
> + * Restore any state modified by prepare_to_busy_poll(), including re-enabling
> + * preemption.
> + *
> + * @allow_resched: If true, this potentially calls schedule() as part of
> + * enabling preemption. A busy poll loop can use false in order to have an
> + * opportunity to block before rescheduling.
> + */
> +void end_busy_poll(bool allow_resched)
> +{
> +#ifdef CONFIG_SMP
> +	struct rq *rq = this_rq();
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&rq->lock, flags);
> +	BUG_ON(!rq->busy_polling); /* not paired with prepare() */
> +	rq->busy_polling--;
> +	raw_spin_unlock_irqrestore(&rq->lock, flags);
> +#endif

Again, please explain this lock usage.

> +
> +	/*
> +	 * preemption needs to be kept disabled between prepare_to_busy_poll()
> +	 * and end_busy_poll().
> +	 */
> +	BUG_ON(preemptible());
> +	if (allow_resched)
> +		preempt_enable();
> +	else
> +		preempt_enable_no_resched();

NAK on @allow_resched

> +}
> +EXPORT_SYMBOL(end_busy_poll);

_GPL

> +
>  #ifdef CONFIG_CGROUP_SCHED
>  /* task_group_lock serializes the addition/removal of task groups */
>  static DEFINE_SPINLOCK(task_group_lock);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1a68a0536add..58e525c74cc6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5460,6 +5460,11 @@ static int sched_idle_cpu(int cpu)
>  {
>  	return sched_idle_rq(cpu_rq(cpu));
>  }
> +
> +static int sched_idle_or_polling_cpu(int cpu)
> +{
> +	return sched_idle_cpu(cpu) || polling_cpu(cpu);
> +}
>  #endif
> 
>  /*
> @@ -5880,6 +5885,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
>  	u64 latest_idle_timestamp = 0;
>  	int least_loaded_cpu = this_cpu;
>  	int shallowest_idle_cpu = -1;
> +	int found_polling = 0;
>  	int i;
> 
>  	/* Check if we have any choice: */
> @@ -5914,10 +5920,14 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
>  				shallowest_idle_cpu = i;
>  			}
>  		} else if (shallowest_idle_cpu == -1) {
> +			int polling = polling_cpu(i);
> +
>  			load = cpu_load(cpu_rq(i));
> -			if (load < min_load) {
> +			if ((polling == found_polling && load < min_load) ||
> +			    (polling && !found_polling)) {
>  				min_load = load;
>  				least_loaded_cpu = i;
> +				found_polling = polling;
>  			}
>  		}
>  	}
> @@ -6085,7 +6095,7 @@ static int select_idle_smt(struct task_struct *p, int target)
>  	for_each_cpu(cpu, cpu_smt_mask(target)) {
>  		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
>  			continue;
> -		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +		if (available_idle_cpu(cpu) || sched_idle_or_polling_cpu(cpu))
>  			return cpu;
>  	}
> 
> @@ -6149,7 +6159,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>  	for_each_cpu_wrap(cpu, cpus, target) {
>  		if (!--nr)
>  			return -1;
> -		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +		if (available_idle_cpu(cpu) || sched_idle_or_polling_cpu(cpu))
>  			break;
>  	}
> 
> @@ -6179,7 +6189,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  	for_each_cpu_wrap(cpu, cpus, target) {
>  		unsigned long cpu_cap = capacity_of(cpu);
> 
> -		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> +		if (!available_idle_cpu(cpu) && !sched_idle_or_polling_cpu(cpu))
>  			continue;
>  		if (task_fits_capacity(p, cpu_cap))
>  			return cpu;
> @@ -6223,14 +6233,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	}
> 
>  symmetric:
> -	if (available_idle_cpu(target) || sched_idle_cpu(target))
> +	if (available_idle_cpu(target) || sched_idle_or_polling_cpu(target))
>  		return target;
> 
>  	/*
>  	 * If the previous CPU is cache affine and idle, don't be stupid:
>  	 */
>  	if (prev != target && cpus_share_cache(prev, target) &&
> -	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
> +	    (available_idle_cpu(prev) || sched_idle_or_polling_cpu(prev)))
>  		return prev;
> 
>  	/*
> @@ -6252,7 +6262,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  	if (recent_used_cpu != prev &&
>  	    recent_used_cpu != target &&
>  	    cpus_share_cache(recent_used_cpu, target) &&
> -	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> +	    (available_idle_cpu(recent_used_cpu) ||
> +	     sched_idle_or_polling_cpu(recent_used_cpu)) &&
>  	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
>  		/*
>  		 * Replace recent_used_cpu with prev as it is a potential

None of this affects load-tracking.
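(polling_cpu() itself is not in the quoted hunks; going by the sched.h
hunk below, it is presumably just a read of rq->busy_polling, something
like the following guess, not copied from the patch:

	static inline int polling_cpu(int cpu)
	{
	#ifdef CONFIG_SMP
		return READ_ONCE(cpu_rq(cpu)->busy_polling);
	#else
		return 0;
	#endif
	}

which is what the remote-reader side of the lockless sketch above
assumes.)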
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 28709f6b0975..45de468d0ffb 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1003,6 +1003,8 @@ struct rq {
> 
>  	/* This is used to determine avg_idle's max value */
>  	u64			max_idle_balance_cost;
> +
> +	unsigned int		busy_polling;

This is a good location, cache-wise, because?

>  #endif /* CONFIG_SMP */
> 
>  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
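To put the sleep+wake argument in context, a caller of this API would
presumably look something like this (hypothetical; condition_met() and
wq are placeholders, not from the patch):

	static DECLARE_WAIT_QUEUE_HEAD(wq);

	static void poll_then_block(void)
	{
		prepare_to_busy_poll();
		while (!condition_met() && continue_busy_poll())
			cpu_relax();
		end_busy_poll(false);	/* no resched; we intend to block */

		if (!condition_met())
			wait_event(wq, condition_met());
	}

The wait_event() leg is the sleep+wake cycle in question: blocking
there lets the subsequent wakeup run CPU selection and land on an idle
(or polling) CPU, but only by paying for a full block/wakeup instead of
an in-place preemption.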