From: Qiuxu Zhuo <qiuxu.zhuo@xxxxxxxxx> Current skx_edac driver doesn't support address translation for non-volatile DIMMs. The ACPI ADXL DSM method support address translation for both volatile DIMMs and non-volatile DIMMs. So switch skx_edac to use the wrapped ACPI DSM methods, if they are supported and there are non-volatile DIMMs populated on the system. Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@xxxxxxxxx> Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx> --- drivers/edac/Kconfig | 1 + drivers/edac/skx_edac.c | 191 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 179 insertions(+), 13 deletions(-) diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 57304b2e989f..ffd349c12479 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -234,6 +234,7 @@ config EDAC_SKX depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y select DMI + select ACPI_ADXL help Support for error detection and correction the Intel Skylake server Integrated Memory Controllers. If your diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index a710169abdbc..598564f9c4b2 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -26,6 +26,7 @@ #include <linux/bitmap.h> #include <linux/math64.h> #include <linux/mod_devicetable.h> +#include <linux/adxl.h> #include <acpi/nfit.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> @@ -35,6 +36,7 @@ #include "edac_module.h" #define EDAC_MOD_STR "skx_edac" +#define MSG_SIZE 1024 /* * Debug macros @@ -54,6 +56,26 @@ static LIST_HEAD(skx_edac_list); static u64 skx_tolm, skx_tohm; +static int nvdimm_count; +static u64 *adxl_values; + +enum { + INDEX_SOCKET, + INDEX_MEMCTRL, + INDEX_CHANNEL, + INDEX_DIMM, + INDEX_MAX +}; +static char component_names[][32] = { + [INDEX_SOCKET] = "ProcessorSocketId", + [INDEX_MEMCTRL] = "MemoryControllerId", + [INDEX_CHANNEL] = "ChannelId", + [INDEX_DIMM] = "DimmSlotId", +}; + +static int component_indices[ARRAY_SIZE(component_names)]; +static int adxl_component_count; +static const char * const *adxl_component_names; #define NUM_IMC 2 /* memory controllers per socket */ #define NUM_CHANNELS 3 /* channels per memory controller */ @@ -102,11 +124,11 @@ struct decoded_addr { u64 addr; int socket; int imc; - int channel; + u64 channel; u64 chan_addr; int sktways; int chanways; - int dimm; + u64 dimm; int rank; int channel_rank; u64 rank_address; @@ -393,6 +415,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, u16 flags; u64 size = 0; + nvdimm_count++; + dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc, imc->src_id, 0); @@ -682,7 +706,7 @@ static bool skx_sad_decode(struct decoded_addr *res) res->imc = GET_BITFIELD(d->mcroute, lchan * 3, lchan * 3 + 2); res->channel = GET_BITFIELD(d->mcroute, lchan * 2 + 18, lchan * 2 + 19); - edac_dbg(2, "%llx: socket=%d imc=%d channel=%d\n", + edac_dbg(2, "%llx: socket=%d imc=%d channel=%llu\n", res->addr, res->socket, res->imc, res->channel); return true; } @@ -818,7 +842,7 @@ static bool skx_rir_decode(struct decoded_addr *res) res->dimm = chan_rank / 4; res->rank = chan_rank % 4; - edac_dbg(2, "%llx: dimm=%d rank=%d chan_rank=%d rank_addr=%llx\n", + edac_dbg(2, "%llx: dimm=%llu rank=%d chan_rank=%d rank_addr=%llx\n", res->addr, res->dimm, res->rank, res->channel_rank, res->rank_address); return true; @@ -894,12 +918,48 @@ static bool skx_decode(struct decoded_addr *res) skx_rir_decode(res) && skx_mad_decode(res); } +static bool skx_dsm_decode(u64 addr, char *msg, int msglen, + u64 *sock, u64 *imc, + u64 *chan, u64 *dimm) + +{ + int i, len = 0; + + if (addr >= skx_tohm || (addr >= skx_tolm && addr < BIT_ULL(32))) { + edac_dbg(0, "Address %llx out of range\n", addr); + return false; + } + + if (adxl_decode(addr, adxl_values)) { + edac_dbg(0, "Failed to decode 0x%llx\n", addr); + return false; + } + + *sock = adxl_values[component_indices[INDEX_SOCKET]]; + *imc = adxl_values[component_indices[INDEX_MEMCTRL]]; + *chan = adxl_values[component_indices[INDEX_CHANNEL]]; + *dimm = adxl_values[component_indices[INDEX_DIMM]]; + + for (i = 0; i < adxl_component_count; i++) { + if (adxl_values[i] == ~0x0ull) + continue; + len += snprintf(msg + len, msglen, " %s:0x%llx", + adxl_component_names[i], adxl_values[i]); + + if (msglen - len <= 0) + break; + } + + return true; +} + static void skx_mce_output_error(struct mem_ctl_info *mci, const struct mce *m, - struct decoded_addr *res) + struct decoded_addr *res, + char *msg) { enum hw_event_mc_err_type tp_event; - char *type, *optype, msg[256]; + char *type, *optype; bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); @@ -960,13 +1020,11 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } } - snprintf(msg, sizeof(msg), - "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x", + snprintf(msg, MSG_SIZE, + "%s%s err_code:%04x:%04x %s", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", - mscod, errcode, - res->socket, res->imc, res->rank, - res->bank_group, res->bank_address, res->row, res->column); + mscod, errcode, msg + MSG_SIZE); edac_dbg(0, "%s\n", msg); @@ -977,13 +1035,33 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, msg); } +static struct mem_ctl_info *get_mci(u64 src_id, u64 lmc) +{ + struct skx_dev *d; + + if (lmc > NUM_IMC - 1) { + skx_printk(KERN_ERR, "Bad mc# 0x%llx\n", lmc); + return NULL; + } + + list_for_each_entry(d, &skx_edac_list, list) { + if (d->imc[0].src_id == src_id) + return d->imc[lmc].mci; + } + + skx_printk(KERN_ERR, "No mci for src_id %llx lmc %llx\n", src_id, lmc); + + return NULL; +} + static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { struct mce *mce = (struct mce *)data; struct decoded_addr res; struct mem_ctl_info *mci; - char *type; + u64 socket, memctrl; + char *type, *msg; if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; @@ -992,11 +1070,35 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; + msg = kzalloc(MSG_SIZE * 2, GFP_KERNEL); + if (!msg) + return NOTIFY_DONE; + + if (adxl_component_count) + goto dsm_decoding; + res.addr = mce->addr; if (!skx_decode(&res)) return NOTIFY_DONE; mci = res.dev->imc[res.imc].mci; + snprintf(msg + MSG_SIZE, MSG_SIZE, + "socket:%d imc:%d chan:%llu dimm:%llu rank:%d bg:%d ba:%d row:%x col:%x", + res.socket, res.imc, res.channel, res.dimm, res.rank, + res.bank_group, res.bank_address, res.row, res.column); + + goto decoded; + +dsm_decoding: + if (!skx_dsm_decode(mce->addr, msg + MSG_SIZE, MSG_SIZE, + &socket, &memctrl, &res.channel, &res.dimm)) + goto out; + + mci = get_mci(socket, memctrl); + if (!mci) + goto out; + +decoded: if (mce->mcgstatus & MCG_STATUS_MCIP) type = "Exception"; else @@ -1015,8 +1117,10 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, "%u APIC %x\n", mce->cpuvendor, mce->cpuid, mce->time, mce->socketid, mce->apicid); - skx_mce_output_error(mci, mce, &res); + skx_mce_output_error(mci, mce, &res, msg); +out: + kfree(msg); return NOTIFY_DONE; } @@ -1090,6 +1194,54 @@ static void skx_remove(void) } } +static void __init skx_dsm_get(void) +{ + const char * const *names; + int i, j; + + names = adxl_get_component_names(); + if (!names) { + skx_printk(KERN_NOTICE, "No firmware support for address translation."); + skx_printk(KERN_CONT, " Only decoding DDR4 address!\n"); + return; + } + + for (i = 0; i < INDEX_MAX; i++) { + for (j = 0; names[j]; j++) { + if (!strcmp(component_names[i], names[j])) { + component_indices[i] = j; + break; + } + } + + if (!names[j]) + goto err; + } + + adxl_component_names = names; + while (*names++) + adxl_component_count++; + + adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values), GFP_KERNEL); + if (!adxl_values) { + adxl_component_count = 0; + edac_dbg(0, "No memory for adxl_decode()\n"); + } + + return; +err: + skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ", + component_names[i]); + for (j = 0; names[j]; j++) + skx_printk(KERN_CONT, "%s ", names[j]); + skx_printk(KERN_CONT, "\n"); +} + +static void __exit skx_dsm_put(void) +{ + kfree(adxl_values); +} + /* * skx_init: * make sure we are running on the correct cpu model @@ -1154,6 +1306,9 @@ static int __init skx_init(void) } } + if (nvdimm_count) + skx_dsm_get(); + /* Ensure that the OPSTATE is set correctly for POLL or NMI */ opstate_init(); @@ -1170,6 +1325,8 @@ static int __init skx_init(void) static void __exit skx_exit(void) { edac_dbg(2, "\n"); + if (nvdimm_count) + skx_dsm_put(); mce_unregister_decode_chain(&skx_mce_dec); skx_remove(); teardown_skx_debug(); @@ -1180,6 +1337,14 @@ module_exit(skx_exit); module_param(edac_op_state, int, 0444); MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); +module_param_string(sock, component_names[0], sizeof(component_names[0]), 0644); +MODULE_PARM_DESC(sock, "String to get socket ID"); +module_param_string(imc, component_names[1], sizeof(component_names[1]), 0644); +MODULE_PARM_DESC(imc, "String to get IMC ID"); +module_param_string(chan, component_names[2], sizeof(component_names[2]), 0644); +MODULE_PARM_DESC(chan, "String to get channel ID"); +module_param_string(dimm, component_names[3], sizeof(component_names[3]), 0644); +MODULE_PARM_DESC(dimm, "String to get DIMM ID"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Tony Luck"); -- 2.17.1