On Fri, 23 Oct 2020 at 05:30, Josh Don <joshdon@xxxxxxxxxx> wrote:
>
> Busy polling loops in the kernel such as network socket poll and kvm
> halt polling have performance problems related to process scheduler load
> accounting.
>
> Both of the busy polling examples are opportunistic - they relinquish
> the cpu if another thread is ready to run. This design, however, doesn't
> extend to multiprocessor load balancing very well. The scheduler still
> sees the busy polling cpu as 100% busy and will be less likely to put
> another thread on that cpu. In other words, if all cores are 100%
> utilized and some of them are running real workloads and some others are
> running busy polling loops, newly woken up threads will not prefer the
> busy polling cpus. System wide throughput and latency may suffer.
>
> This change allows the scheduler to detect busy polling cpus in order to
> allow them to be more frequently considered for wake up balancing.
>
> This change also disables preemption for the duration of the busy
> polling loop. This is important, as it ensures that if a polling thread
> decides to end its poll to relinquish cpu to another thread, the polling
> thread will actually exit the busy loop and potentially block. When it
> later becomes runnable, it will have the opportunity to find an idle cpu
> via wakeup cpu selection.
>
> Suggested-by: Xi Wang <xii@xxxxxxxxxx>
> Signed-off-by: Josh Don <joshdon@xxxxxxxxxx>
> Signed-off-by: Xi Wang <xii@xxxxxxxxxx>
> ---
>  include/linux/sched.h |  5 +++
>  kernel/sched/core.c   | 94 +++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/fair.c   | 25 ++++++++----
>  kernel/sched/sched.h  |  2 +
>  4 files changed, 119 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index afe01e232935..80ef477e5a87 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1651,6 +1651,7 @@ extern int can_nice(const struct task_struct *p, const int nice);
>  extern int task_curr(const struct task_struct *p);
>  extern int idle_cpu(int cpu);
>  extern int available_idle_cpu(int cpu);
> +extern int polling_cpu(int cpu);
>  extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
>  extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
>  extern void sched_set_fifo(struct task_struct *p);
> @@ -2048,4 +2049,8 @@ int sched_trace_rq_nr_running(struct rq *rq);
>
>  const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>
> +extern void prepare_to_busy_poll(void);
> +extern int continue_busy_poll(void);
> +extern void end_busy_poll(bool allow_resched);
> +
>  #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2d95dc3f4644..2783191d0bd4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5107,6 +5107,24 @@ int available_idle_cpu(int cpu)
>          return 1;
>  }
>
> +/**
> + * polling_cpu - is a given CPU currently running a thread in a busy polling
> + * loop that could be preempted if a new thread were to be scheduled?
> + * @cpu: the CPU in question.
> + *
> + * Return: 1 if the CPU is currently polling. 0 otherwise.
> + */
> +int polling_cpu(int cpu)
> +{
> +#ifdef CONFIG_SMP
> +        struct rq *rq = cpu_rq(cpu);
> +
> +        return unlikely(rq->busy_polling);
> +#else
> +        return 0;
> +#endif
> +}
> +
>  /**
>   * idle_task - return the idle task for a given CPU.
>   * @cpu: the processor in question.
> @@ -7191,6 +7209,7 @@ void __init sched_init(void)
>
>                  rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
>  #endif
> +                rq->busy_polling = 0;
>  #endif /* CONFIG_SMP */
>                  hrtick_rq_init(rq);
>                  atomic_set(&rq->nr_iowait, 0);
> @@ -7417,6 +7436,81 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
>
>  #endif
>
> +/*
> + * Calling this function before entering a preemptible busy polling loop will
> + * help the scheduler make better load balancing decisions. Wake up balance
> + * will treat the polling cpu as idle.
> + *
> + * Preemption is disabled inside this function and re-enabled in
> + * end_busy_poll(), thus the polling loop must periodically check
> + * continue_busy_poll().
> + *
> + * REQUIRES: prepare_to_busy_poll(), continue_busy_poll(), and end_busy_poll()
> + * must be used together.
> + */
> +void prepare_to_busy_poll(void)
> +{
> +        struct rq __maybe_unused *rq = this_rq();
> +        unsigned long __maybe_unused flags;
> +
> +        /* Preemption will be reenabled by end_busy_poll() */
> +        preempt_disable();
> +
> +#ifdef CONFIG_SMP
> +        raw_spin_lock_irqsave(&rq->lock, flags);
> +        /* preemption disabled; only one thread can poll at a time */
> +        WARN_ON_ONCE(rq->busy_polling);
> +        rq->busy_polling++;
> +        raw_spin_unlock_irqrestore(&rq->lock, flags);
> +#endif
> +}
> +EXPORT_SYMBOL(prepare_to_busy_poll);
> +
> +int continue_busy_poll(void)
> +{
> +        if (!single_task_running())
> +                return 0;
> +
> +        /* Important that we check this, since preemption is disabled */
> +        if (need_resched())
> +                return 0;
> +
> +        return 1;
> +}
> +EXPORT_SYMBOL(continue_busy_poll);
> +
> +/*
> + * Restore any state modified by prepare_to_busy_poll(), including re-enabling
> + * preemption.
> + *
> + * @allow_resched: If true, this potentially calls schedule() as part of
> + * enabling preemption. A busy poll loop can use false in order to have an
> + * opportunity to block before rescheduling.
> + */
> +void end_busy_poll(bool allow_resched)
> +{
> +#ifdef CONFIG_SMP
> +        struct rq *rq = this_rq();
> +        unsigned long flags;
> +
> +        raw_spin_lock_irqsave(&rq->lock, flags);
> +        BUG_ON(!rq->busy_polling); /* not paired with prepare() */
> +        rq->busy_polling--;
> +        raw_spin_unlock_irqrestore(&rq->lock, flags);
> +#endif
> +
> +        /*
> +         * preemption needs to be kept disabled between prepare_to_busy_poll()
> +         * and end_busy_poll().
> +         */
> +        BUG_ON(preemptible());
> +        if (allow_resched)
> +                preempt_enable();
> +        else
> +                preempt_enable_no_resched();
> +}
> +EXPORT_SYMBOL(end_busy_poll);
> +
>  #ifdef CONFIG_CGROUP_SCHED
>  /* task_group_lock serializes the addition/removal of task groups */
>  static DEFINE_SPINLOCK(task_group_lock);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1a68a0536add..58e525c74cc6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5460,6 +5460,11 @@ static int sched_idle_cpu(int cpu)
>  {
>          return sched_idle_rq(cpu_rq(cpu));
>  }
> +
> +static int sched_idle_or_polling_cpu(int cpu)
> +{
> +        return sched_idle_cpu(cpu) || polling_cpu(cpu);
> +}
>  #endif
>
>  /*
> @@ -5880,6 +5885,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
>          u64 latest_idle_timestamp = 0;
>          int least_loaded_cpu = this_cpu;
>          int shallowest_idle_cpu = -1;
> +        int found_polling = 0;
>          int i;
>
>          /* Check if we have any choice: */
> @@ -5914,10 +5920,14 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
>                                  shallowest_idle_cpu = i;
>                          }
>                  } else if (shallowest_idle_cpu == -1) {
> +                        int polling = polling_cpu(i);
> +
>                          load = cpu_load(cpu_rq(i));
> -                        if (load < min_load) {
> +                        if ((polling == found_polling && load < min_load) ||
> +                            (polling && !found_polling)) {

This really looks like a horrible hack. This case is used to compare
the load when there is no idle cpu.

>                                  min_load = load;
>                                  least_loaded_cpu = i;
> +                                found_polling = polling;
>                          }
>                  }
>          }
> @@ -6085,7 +6095,7 @@ static int select_idle_smt(struct task_struct *p, int target)
>          for_each_cpu(cpu, cpu_smt_mask(target)) {
>                  if (!cpumask_test_cpu(cpu, p->cpus_ptr))
>                          continue;
> -                if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +                if (available_idle_cpu(cpu) || sched_idle_or_polling_cpu(cpu))
>                          return cpu;
>          }
>
> @@ -6149,7 +6159,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
>          for_each_cpu_wrap(cpu, cpus, target) {
>                  if (!--nr)
>                          return -1;
> -                if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +                if (available_idle_cpu(cpu) || sched_idle_or_polling_cpu(cpu))
>                          break;
>          }
>
> @@ -6179,7 +6189,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>          for_each_cpu_wrap(cpu, cpus, target) {
>                  unsigned long cpu_cap = capacity_of(cpu);
>
> -                if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> +                if (!available_idle_cpu(cpu) && !sched_idle_or_polling_cpu(cpu))
>                          continue;
>                  if (task_fits_capacity(p, cpu_cap))
>                          return cpu;
> @@ -6223,14 +6233,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>          }
>
>  symmetric:
> -        if (available_idle_cpu(target) || sched_idle_cpu(target))
> +        if (available_idle_cpu(target) || sched_idle_or_polling_cpu(target))
>                  return target;
>
>          /*
>           * If the previous CPU is cache affine and idle, don't be stupid:
>           */
>          if (prev != target && cpus_share_cache(prev, target) &&
> -            (available_idle_cpu(prev) || sched_idle_cpu(prev)))
> +            (available_idle_cpu(prev) || sched_idle_or_polling_cpu(prev)))
>                  return prev;
>
>          /*
> @@ -6252,7 +6262,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>          if (recent_used_cpu != prev &&
>              recent_used_cpu != target &&
>              cpus_share_cache(recent_used_cpu, target) &&
> -            (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
> +            (available_idle_cpu(recent_used_cpu) ||
> +             sched_idle_or_polling_cpu(recent_used_cpu)) &&
>              cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
>                  /*
>                   * Replace recent_used_cpu with prev as it is a potential
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 28709f6b0975..45de468d0ffb 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1003,6 +1003,8 @@ struct rq {
>
>          /* This is used to determine avg_idle's max value */
>          u64 max_idle_balance_cost;
> +
> +        unsigned int busy_polling;
>  #endif /* CONFIG_SMP */
>
>  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> --
> 2.29.0.rc1.297.gfa9743e501-goog
>
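
One more general comment on the interface: the block comment above
prepare_to_busy_poll() says the three calls must be used together and that
the loop has to keep checking continue_busy_poll(), but the series doesn't
include a caller. For the record, this is roughly how I read the intended
calling pattern (an illustrative sketch only; example_busy_wait(),
do_poll_once() and poll_ns are made-up placeholders, not anything in this
patch):

        /* Sketch of a caller of the proposed API; not from this patch. */
        static bool example_busy_wait(u64 poll_ns)
        {
                u64 start = ktime_get_ns();
                bool found = false;

                /* Disables preemption and marks this rq as busy polling. */
                prepare_to_busy_poll();

                while (ktime_get_ns() - start < poll_ns) {
                        found = do_poll_once();  /* placeholder poll body */
                        if (found || !continue_busy_poll())
                                break;
                        cpu_relax();
                }

                /*
                 * false: re-enable preemption without rescheduling right
                 * here, so the caller still gets the chance to block first.
                 */
                end_busy_poll(false);

                return found;
        }

If that matches the intent, it may be worth putting a snippet along these
lines in the comment above prepare_to_busy_poll(), to make the required
pairing and the allow_resched=false convention harder to get wrong.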