Current NMI mechanism is to process all the handlers for each NMI. Because perf uses NMI, so GHES NMI handler runs unnecessarily for every perf NMI handling. This will be captured by PMU's PEBS (Precise Event Based Sampling) and disturb perf result. GHES NMIs are very rare because they are only used in extreme error situations or very frequent when machine is dying and error floods happen. So add a GHES NMI nice level via GHES platform device sysfs if it's > 0 and any other NMI (e.g. PMU NMI) has been handled for current NMI, then skip current GHES NMI handler. So next PMU NMI can be processed early and perf result is not distrubed by GHES NMI handler. We reply statistically on the property that GHES NMIs are unlikely to collide with perf NMIs, or they are frequent there will be enough of them that it doesn't matter. It's a heuristics that is not 100% correct, but a reasonable one, and it saves a lot of unnecessary work for every NMI. Test machines have HEST ACPI table installed and NMI notification set, test cmds are 'perf mem record -a sleep 1' and 'perf mem report'. Before applying patch (perf memory profile): On Intel Broadwell-4S: 0.63% 1 17910 LFB or LFB hit [k] intel_pstate_update_util 0.59% 1 16960 LFB or LFB hit [k] intel_pstate_update_util ... 0.30% 1 8722 L1 or L1 hit [k] ghes_notify_nmi On Intel Skylake-4S: 3.45% 1 20218 L1 hit [k] native_read_msr 1.21% 1 7078 LFB hit [k] intel_pstate_update_util ... 1.21% 1 7077 N/A miss [k] ghes_notify_nmi After applying patch and 'echo 1 > /sys/devices/platform/GHES.[0-9]*/nmi_nice': No GHES was showed up in perf memory profile. Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@xxxxxxxxx> Suggested-by: Ying Huang <ying.huang@xxxxxxxxx> Reported-by: Andi Kleen <andi.kleen@xxxxxxxxx> --- drivers/acpi/apei/ghes.c | 33 +++++++++++++++++++++++++++++++-- include/acpi/ghes.h | 2 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index afc3814f8a8b..394cb5286fb3 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -47,6 +47,7 @@ #include <linux/sched/clock.h> #include <linux/uuid.h> #include <linux/ras.h> +#include <linux/device.h> #include <acpi/actbl1.h> #include <acpi/ghes.h> @@ -129,6 +130,8 @@ static atomic_t ghes_estatus_cache_alloced; static int ghes_panic_timeout __read_mostly = 30; +static int ghes_nmi_nice; + static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) { phys_addr_t paddr; @@ -929,11 +932,32 @@ static void __process_error(struct ghes *ghes) #endif } +static ssize_t nmi_nice_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ghes_nmi_nice); +} + +static ssize_t nmi_nice_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &ghes_nmi_nice)) + return -EINVAL; + + return count; +} +static DEVICE_ATTR_RW(nmi_nice); + static int ghes_notify_nmi(unsigned int cmd, int handled, struct pt_regs *regs) { struct ghes *ghes; int sev, ret = NMI_DONE; + if (handled && ghes_nmi_nice > 0) + return NMI_DONE; + if (!atomic_add_unless(&ghes_in_nmi, 1, 1)) return ret; @@ -994,8 +1018,10 @@ static void ghes_nmi_add(struct ghes *ghes) len = ghes_esource_prealloc_size(ghes->generic); ghes_estatus_pool_expand(len); mutex_lock(&ghes_list_mutex); - if (list_empty(&ghes_nmi)) + if (list_empty(&ghes_nmi)) { register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes"); + device_create_file(ghes->dev, &dev_attr_nmi_nice); + } list_add_rcu(&ghes->list, &ghes_nmi); mutex_unlock(&ghes_list_mutex); } @@ -1006,8 +1032,10 @@ static void ghes_nmi_remove(struct ghes *ghes) mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); - if (list_empty(&ghes_nmi)) + if (list_empty(&ghes_nmi)) { + device_remove_file(ghes->dev, &dev_attr_nmi_nice); unregister_nmi_handler(NMI_LOCAL, "ghes"); + } mutex_unlock(&ghes_list_mutex); /* * To synchronize with NMI handler, ghes can only be @@ -1086,6 +1114,7 @@ static int ghes_probe(struct platform_device *ghes_dev) ghes = NULL; goto err; } + ghes->dev = &ghes_dev->dev; switch (generic->notify.type) { case ACPI_HEST_NOTIFY_POLLED: diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 1624e2be485c..936903d3af16 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -29,6 +29,8 @@ struct ghes { struct timer_list timer; unsigned int irq; }; + + struct device *dev; }; struct ghes_estatus_node { -- 2.17.1