Find the CPU caches in the ACPI PPTT and add a CPU EDAC device and
EDAC device blocks for the caches found.

For the firmware-first error handling, add an interface in ghes_edac
to report the CPU corrected error count for a CPU core to user space
through the CPU EDAC device.

Suggested-by: James Morse <james.morse@xxxxxxx>
Signed-off-by: Jonathan Cameron <jonathan.cameron@xxxxxxxxxx>
Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx>
---
 drivers/edac/Kconfig     |  10 +++
 drivers/edac/ghes_edac.c | 190 +++++++++++++++++++++++++++++++++++++++
 include/acpi/ghes.h      |  27 ++++++
 3 files changed, 227 insertions(+)

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 7a47680d6f07..3a0d8d134dcc 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -74,6 +74,16 @@ config EDAC_GHES
 
 	  In doubt, say 'Y'.
 
+config EDAC_GHES_CPU_ERROR
+	bool "EDAC device for reporting firmware-first BIOS detected CPU error count"
+	depends on EDAC_GHES && ACPI_PPTT
+	help
+	  EDAC device for the firmware-first BIOS detected CPU error count reported
+	  via ACPI APEI/GHES. Enabling this option creates an EDAC device for the
+	  CPU hierarchy and EDAC device blocks for the cache hierarchy.
+	  The CPU error count is shared with userspace via the CPU EDAC
+	  device's sysfs interface.
+
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64)"
 	depends on AMD_NB && EDAC_DECODE_MCE
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index a918ca93e4f7..96619483e5f3 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -12,6 +12,9 @@
 #include <acpi/ghes.h>
 #include <linux/edac.h>
 #include <linux/dmi.h>
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+#include <linux/cacheinfo.h>
+#endif
 #include "edac_module.h"
 #include <ras/ras_event.h>
 
@@ -497,6 +500,185 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	spin_unlock_irqrestore(&ghes_lock, flags);
 }
 
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+#define MAX_NUM_CACHES 20
+
+/*
+ * Lookup table mapping a (CPU, cache) pair to its EDAC device block. It is a
+ * flat array indexed by (cpu * max_number_of_caches) + cache index; slots of
+ * CPUs that have fewer caches than max_number_of_caches stay zeroed.
+ */
+static struct ghes_edac_cpu_block {
+	int cpu;
+	u8 level;
+	u8 type;
+	int block_nr;
+} *cpu_edac_block_list;
+
+static struct edac_device_ctl_info *cpu_edac_dev;
+static int max_number_of_caches;
+
+/**
+ * ghes_edac_report_cpu_error - add corrected CPU cache errors to the EDAC count
+ * @einfo: CPU error information, see struct ghes_einfo_cpu
+ *
+ * Finds the cache block of @einfo->cpu whose level and type match @einfo and
+ * adds @einfo->ce_count to its corrected error count. The report is dropped if
+ * the CPU EDAC device is not set up, the CPU index is out of range, the count
+ * is zero or no block matches.
+ */
+void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo)
+{
+	struct ghes_edac_cpu_block *block;
+	int i;
+
+	if (!einfo || !einfo->ce_count)
+		return;
+
+	/* Nothing to account against when the CPU EDAC device does not exist. */
+	if (!cpu_edac_dev || !cpu_edac_block_list || !max_number_of_caches)
+		return;
+
+	/* The CPU index is supplied by the caller; keep it inside the table. */
+	if (einfo->cpu < 0 || (unsigned int)einfo->cpu >= nr_cpu_ids)
+		return;
+
+	block = cpu_edac_block_list + (einfo->cpu * max_number_of_caches);
+	for (i = 0; i < max_number_of_caches; i++, block++) {
+		if (block->level == einfo->cache_level &&
+		    block->type == einfo->cache_type) {
+			edac_device_handle_ce_count(cpu_edac_dev, einfo->ce_count,
+						    einfo->cpu, block->block_nr, "");
+			break;
+		}
+	}
+}
+
+/*
+ * Allocate and register the "cpu" EDAC device with one instance per possible
+ * CPU and max_number_of_caches "cache" blocks per instance.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails or the error returned
+ * by edac_device_add_device(). cpu_edac_dev is NULL after a failure.
+ */
+static int ghes_edac_add_cpu_device(struct device *dev)
+{
+	int rc;
+
+	cpu_edac_dev = edac_device_alloc_ctl_info(0, "cpu", num_possible_cpus(),
+						  "cache", max_number_of_caches, 0, NULL,
+						  0, edac_device_alloc_index());
+	if (!cpu_edac_dev) {
+		pr_warn("edac_device_alloc_ctl_info for cpu_edac_dev failed\n");
+		return -ENOMEM;
+	}
+
+	cpu_edac_dev->dev = dev;
+	cpu_edac_dev->ctl_name = "cpu_edac_dev";
+	cpu_edac_dev->dev_name = "ghes";
+	cpu_edac_dev->mod_name = "ghes_edac.c";
+	rc = edac_device_add_device(cpu_edac_dev);
+	if (rc) {
+		pr_warn("edac_device_add_device failed\n");
+		edac_device_free_ctl_info(cpu_edac_dev);
+		/* Do not leave a dangling pointer for the caller's cleanup path. */
+		cpu_edac_dev = NULL;
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove the CPU EDAC device and free the block table. The pointers are
+ * cleared so that calling this again, e.g. from ghes_edac_unregister() after a
+ * failed ghes_edac_create_cpu_device(), is harmless.
+ */
+static void ghes_edac_delete_cpu_device(void)
+{
+	max_number_of_caches = 0;
+	if (cpu_edac_dev) {
+		edac_device_del_device(cpu_edac_dev->dev);
+		edac_device_free_ctl_info(cpu_edac_dev);
+		cpu_edac_dev = NULL;
+	}
+	vfree(cpu_edac_block_list);
+	cpu_edac_block_list = NULL;
+}
+
+/*
+ * Query the caches of every possible CPU, register the CPU EDAC device and
+ * fill cpu_edac_block_list. Everything is torn down again if any step fails.
+ */
+static void ghes_edac_create_cpu_device(struct device *dev)
+{
+	int cpu, i;
+	struct ghes_edac_cpu_block *block;
+	int number_of_caches;
+	int max_caches = 0;
+	struct acpi_cacheinfo cacheinfo[MAX_NUM_CACHES];
+
+	/* Find the maximum number of caches present in the cpu hierarchy among the CPUs */
+	for_each_possible_cpu(cpu) {
+		number_of_caches = acpi_find_cache_info(cpu, &cacheinfo[0], MAX_NUM_CACHES);
+		if (number_of_caches <= 0)
+			return;
+
+		if (max_caches < number_of_caches)
+			max_caches = number_of_caches;
+	}
+	if (!max_caches)
+		return;
+
+	/*
+	 * The EDAC device interface creates the hierarchy for all CPUs at once with
+	 * the same number of blocks per instance, so the table reserves
+	 * max_number_of_caches slots per CPU even though the number of caches per
+	 * CPU might vary. It is indexed by CPU id, hence sized by nr_cpu_ids: ids
+	 * returned by for_each_possible_cpu() are below nr_cpu_ids but not
+	 * necessarily below num_possible_cpus().
+	 */
+	cpu_edac_block_list = vzalloc((size_t)nr_cpu_ids * max_caches *
+				      sizeof(*cpu_edac_block_list));
+	if (!cpu_edac_block_list)
+		return;
+
+	/*
+	 * Publish the count only now that the table exists, so that an early return
+	 * above never leaves ghes_edac_report_cpu_error() with a count but no table.
+	 */
+	max_number_of_caches = max_caches;
+
+	if (ghes_edac_add_cpu_device(dev))
+		goto error;
+
+	for_each_possible_cpu(cpu) {
+		memset(cacheinfo, 0, sizeof(cacheinfo));
+		number_of_caches = acpi_find_cache_info(cpu, &cacheinfo[0], MAX_NUM_CACHES);
+		/* More caches than in the first pass would overflow this CPU's slots. */
+		if (number_of_caches <= 0 || number_of_caches > max_number_of_caches)
+			goto error;
+		/*
+		 * The EDAC cpu cache device block entries in sysfs should match the cpu
+		 * cache structure in sysfs so that the affected cpus for a shared cache
+		 * can be easily extracted in userspace.
+		 */
+		for (i = 0; i < number_of_caches; i++) {
+			block = cpu_edac_block_list + (cpu * max_number_of_caches) + i;
+			block->cpu = cpu;
+			block->level = cacheinfo[i].level;
+			block->type = cacheinfo[i].type;
+			block->block_nr = i;
+		}
+	}
+
+	return;
+
+error:
+	ghes_edac_delete_cpu_device();
+}
+#endif
+
 /*
  * Known systems that are safe to enable this module.
  */
@@ -624,6 +806,10 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 	ghes_pvt = pvt;
 	spin_unlock_irqrestore(&ghes_lock, flags);
 
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+	ghes_edac_create_cpu_device(dev);
+#endif
+
 	/* only set on success */
 	refcount_set(&ghes_refcount, 1);
 
@@ -654,6 +840,10 @@ void ghes_edac_unregister(struct ghes *ghes)
 	if (!refcount_dec_and_test(&ghes_refcount))
 		goto unlock;
 
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+	ghes_edac_delete_cpu_device();
+#endif
+
 	/*
 	 * Wait for the irq handler being finished.
 	 */
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 34fb3431a8f3..a9098daf53d4 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -73,6 +73,24 @@ void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);
 
 int ghes_estatus_pool_init(int num_ghes);
 
+/**
+ * struct ghes_einfo_cpu - structure to pass cpu error info to the edac
+ * @cpu: CPU index.
+ * @error_type: error type, cache/TLB/bus etc.
+ * @cache_level: cache level.
+ * @cache_type: ACPI cache type.
+ * @ue_count: CPU uncorrectable error count.
+ * @ce_count: CPU correctable error count.
+ */
+struct ghes_einfo_cpu {
+	int cpu;
+	u8 error_type;
+	u8 cache_level;
+	u8 cache_type;
+	u16 ue_count;
+	u16 ce_count;
+};
+
 /* From drivers/edac/ghes_edac.c */
 
 #ifdef CONFIG_EDAC_GHES
@@ -98,6 +116,15 @@ static inline void ghes_edac_unregister(struct ghes *ghes)
 }
 #endif
 
+#ifdef CONFIG_EDAC_GHES_CPU_ERROR
+void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo_cpu);
+
+#else
+static inline void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo_cpu)
+{
+}
+#endif
+
 static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
 {
 	return gdata->revision >> 8;
-- 
2.17.1