On May 13, 2019 5:20:37 AM EDT, Wanpeng Li <kernellwp@xxxxxxxxx> wrote:
>On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@xxxxxxxxxx> wrote:
>>
>> Certain workloads perform poorly on KVM compared to baremetal
>> due to baremetal's ability to perform mwait on the NEED_RESCHED
>> bit of the task flags (thereby skipping the wakeup IPI).
>
>KVM supports exposing mwait to the guest; can that solve this?
>

There is a bit of a problem with that: the host will see 100% CPU
utilization even if the guest is idle and taking long naps, which,
depending on your dashboard, can make it look like the machine is on
fire.

CCing Ankur and Boris.
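For context, the baremetal idle path being referenced looks roughly
like x86's mwait_idle(); the following is a minimal sketch, not the
exact kernel code:

/*
 * Sketch of mwait-based idle, modeled on x86 mwait_idle().
 * MONITOR is armed on the current task's flags word, so a remote
 * CPU that sets TIF_NEED_RESCHED in it wakes this CPU out of
 * MWAIT directly, without a reschedule IPI.
 */
static void mwait_idle_sketch(void)
{
        if (!current_set_polling_and_test()) {
                /* Arm the monitor on the task flags word. */
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                if (!need_resched())
                        /* Sleep until the monitored line is written. */
                        __mwait(0, 0);
        }
        __current_clr_polling();
}

The waker side (resched_curr()) sees the polling flag and skips
smp_send_reschedule() entirely. A guest without mwait always takes the
IPI, which is the cost the busy-wait window below tries to avoid.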
>Regards,
>Wanpeng Li
>
>>
>> This patch introduces a configurable busy-wait delay before entering
>> the architecture delay routine, allowing wakeup IPIs to be skipped
>> (if the IPI arrives within that window).
>>
>> The real-life workload whose performance this patch improves (by
>> 5-10%) is SAP HANA, for which setting idle_spin to 30 is sufficient.
>>
>> This patch improves the attached server.py and client.py example
>> as follows:
>>
>> Host:                    31.814230202231556
>> Guest:                   38.17718765199993  (83 %)
>> Guest, idle_spin=50us:   33.317709898000004 (95 %)
>> Guest, idle_spin=220us:  32.27826551499999  (98 %)
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
>>
>> ---
>>  kernel/sched/idle.c |   86 ++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 86 insertions(+)
>>
>> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
>> index f5516bae0c1b..bca7656a7ea0 100644
>> --- a/kernel/sched/idle.c
>> +++ b/kernel/sched/idle.c
>> @@ -216,6 +216,29 @@ static void cpuidle_idle_call(void)
>>          rcu_idle_exit();
>>  }
>>
>> +static unsigned int spin_before_idle_us;
>> +
>> +static void do_spin_before_idle(void)
>> +{
>> +        ktime_t now, end_spin;
>> +
>> +        now = ktime_get();
>> +        end_spin = ktime_add_ns(now, spin_before_idle_us * 1000);
>> +
>> +        rcu_idle_enter();
>> +        local_irq_enable();
>> +        stop_critical_timings();
>> +
>> +        do {
>> +                cpu_relax();
>> +                now = ktime_get();
>> +        } while (!tif_need_resched() && ktime_before(now, end_spin));
>> +
>> +        start_critical_timings();
>> +        rcu_idle_exit();
>> +        local_irq_disable();
>> +}
>> +
>>  /*
>>   * Generic idle loop implementation
>>   *
>> @@ -259,6 +282,8 @@ static void do_idle(void)
>>                          tick_nohz_idle_restart_tick();
>>                          cpu_idle_poll();
>>                  } else {
>> +                        if (spin_before_idle_us)
>> +                                do_spin_before_idle();
>>                          cpuidle_idle_call();
>>                  }
>>                  arch_cpu_idle_exit();
>> @@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = {
>>          .switched_to            = switched_to_idle,
>>          .update_curr            = update_curr_idle,
>>  };
>> +
>> +
>> +static ssize_t store_idle_spin(struct kobject *kobj,
>> +                               struct kobj_attribute *attr,
>> +                               const char *buf, size_t count)
>> +{
>> +        unsigned int val;
>> +
>> +        if (kstrtouint(buf, 10, &val) < 0)
>> +                return -EINVAL;
>> +
>> +        if (val > USEC_PER_SEC)
>> +                return -EINVAL;
>> +
>> +        spin_before_idle_us = val;
>> +        return count;
>> +}
>> +
>> +static ssize_t show_idle_spin(struct kobject *kobj,
>> +                              struct kobj_attribute *attr,
>> +                              char *buf)
>> +{
>> +        ssize_t ret;
>> +
>> +        ret = sprintf(buf, "%d\n", spin_before_idle_us);
>> +
>> +        return ret;
>> +}
>> +
>> +static struct kobj_attribute idle_spin_attr =
>> +        __ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin);
>> +
>> +static struct attribute *sched_attrs[] = {
>> +        &idle_spin_attr.attr,
>> +        NULL,
>> +};
>> +
>> +static const struct attribute_group sched_attr_group = {
>> +        .attrs = sched_attrs,
>> +};
>> +
>> +static struct kobject *sched_kobj;
>> +
>> +static int __init sched_sysfs_init(void)
>> +{
>> +        int error;
>> +
>> +        sched_kobj = kobject_create_and_add("sched", kernel_kobj);
>> +        if (!sched_kobj)
>> +                return -ENOMEM;
>> +
>> +        error = sysfs_create_group(sched_kobj, &sched_attr_group);
>> +        if (error)
>> +                goto err;
>> +        return 0;
>> +
>> +err:
>> +        kobject_put(sched_kobj);
>> +        return error;
>> +}
>> +postcore_initcall(sched_sysfs_init);
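For anyone who wants to try this: per the patch, the knob shows up as
/sys/kernel/sched/idle_spin and takes a value in microseconds (0, the
default, disables spinning; values above USEC_PER_SEC are rejected).
A trivial userspace writer, assuming the patch above is applied:

/* Set the idle spin window to 50us (hypothetical helper). */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/sched/idle_spin", "w");

        if (!f) {
                perror("idle_spin");
                return 1;
        }
        fprintf(f, "50\n");
        return fclose(f) != 0;
}

A plain "echo 50 > /sys/kernel/sched/idle_spin" does the same, of
course.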