On Mon, 2019-05-13 at 07:31 -0400, Konrad Rzeszutek Wilk wrote: > On May 13, 2019 5:20:37 AM EDT, Wanpeng Li <kernellwp@xxxxxxxxx> wrote: > > > > On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@xxxxxxxxxx> > > wrote: > > > > > > > > > > > > Certain workloads perform poorly on KVM compared to baremetal > > > due to baremetal's ability to perform mwait on NEED_RESCHED > > > bit of task flags (therefore skipping the IPI). > > > > KVM supports expose mwait to the guest, if it can solve this? > > > > > There is a bit of problem with that. The host will see 100% CPU utilization even if the guest is idle and taking long naps.. > > Which depending on your dashboard can look like the machine is on fire. This can also be fixed. I have a patch that kind of exposes proper information about the *real* utilization here, if that would help. > > CCing Ankur and Boris > > > > > Regards, > > Wanpeng Li > > > > > > > > > > > This patch introduces a configurable busy-wait delay before entering > > the > > > > > > architecture delay routine, allowing wakeup IPIs to be skipped > > > (if the IPI happens in that window). > > > > > > The real-life workload which this patch improves performance > > > is SAP HANA (by 5-10%) (for which case setting idle_spin to 30 > > > is sufficient). 
> > > > > > This patch improves the attached server.py and client.py example > > > as follows: > > > > > > Host: 31.814230202231556 > > > Guest: 38.17718765199993 (83 %) > > > Guest, idle_spin=50us: 33.317709898000004 (95 %) > > > Guest, idle_spin=220us: 32.27826551499999 (98 %) > > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx> > > > > > > --- > > > kernel/sched/idle.c | 86 > > ++++++++++++++++++++++++++++++++++++++++++ > > > > > > 1 file changed, 86 insertions(+) > > > > > > diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c > > > index f5516bae0c1b..bca7656a7ea0 100644 > > > --- a/kernel/sched/idle.c > > > +++ b/kernel/sched/idle.c > > > @@ -216,6 +216,29 @@ static void cpuidle_idle_call(void) > > > rcu_idle_exit(); > > > } > > > > > > +static unsigned int spin_before_idle_us; > > > > > > +static void do_spin_before_idle(void) > > > +{ > > > + ktime_t now, end_spin; > > > + > > > + now = ktime_get(); > > > + end_spin = ktime_add_ns(now, spin_before_idle_us*1000); > > > + > > > + rcu_idle_enter(); > > > + local_irq_enable(); > > > + stop_critical_timings(); > > > + > > > + do { > > > + cpu_relax(); > > > + now = ktime_get(); > > > + } while (!tif_need_resched() && ktime_before(now, end_spin)); > > > + > > > + start_critical_timings(); > > > + rcu_idle_exit(); > > > + local_irq_disable(); > > > +} > > > + > > > /* > > > * Generic idle loop implementation > > > * > > > @@ -259,6 +282,8 @@ static void do_idle(void) > > > tick_nohz_idle_restart_tick(); > > > cpu_idle_poll(); > > > } else { > > > + if (spin_before_idle_us) > > > + do_spin_before_idle(); > > > cpuidle_idle_call(); > > > } > > > arch_cpu_idle_exit(); > > > @@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = { > > > .switched_to = switched_to_idle, > > > .update_curr = update_curr_idle, > > > }; > > > + > > > + > > > +static ssize_t store_idle_spin(struct kobject *kobj, > > > + struct kobj_attribute *attr, > > > + const char *buf, size_t count) > > > +{ > > > + unsigned int 
val; > > > + > > > + if (kstrtouint(buf, 10, &val) < 0) > > > + return -EINVAL; > > > + > > > + if (val > USEC_PER_SEC) > > > + return -EINVAL; > > > + > > > + spin_before_idle_us = val; > > > + return count; > > > +} > > > + > > > +static ssize_t show_idle_spin(struct kobject *kobj, > > > + struct kobj_attribute *attr, > > > + char *buf) > > > +{ > > > + ssize_t ret; > > > + > > > + ret = sprintf(buf, "%d\n", spin_before_idle_us); > > > + > > > + return ret; > > > +} > > > + > > > +static struct kobj_attribute idle_spin_attr = > > > + __ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin); > > > + > > > +static struct attribute *sched_attrs[] = { > > > + &idle_spin_attr.attr, > > > + NULL, > > > +}; > > > + > > > +static const struct attribute_group sched_attr_group = { > > > + .attrs = sched_attrs, > > > +}; > > > + > > > +static struct kobject *sched_kobj; > > > + > > > +static int __init sched_sysfs_init(void) > > > +{ > > > + int error; > > > + > > > + sched_kobj = kobject_create_and_add("sched", kernel_kobj); > > > + if (!sched_kobj) > > > + return -ENOMEM; > > > + > > > + error = sysfs_create_group(sched_kobj, &sched_attr_group); > > > + if (error) > > > + goto err; > > > + return 0; > > > + > > > +err: > > > + kobject_put(sched_kobj); > > > + return error; > > > +} > > > +postcore_initcall(sched_sysfs_init); > Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrer: Christian Schlaeger, Ralf Herbrich Ust-ID: DE 289 237 879 Eingetragen am Amtsgericht Charlottenburg HRB 149173 B