[PATCH 3/3] EDAC, skx_edac: Add address translation for non-volatile DIMMs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Qiuxu Zhuo <qiuxu.zhuo@xxxxxxxxx>

Current skx_edac driver doesn't support address translation for
non-volatile DIMMs.

The ACPI ADXL DSM method support address translation for both
volatile DIMMs and non-volatile DIMMs. So switch skx_edac to use
the wrapped ACPI DSM methods, if they are supported and there are
non-volatile DIMMs populated on the system.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@xxxxxxxxx>
Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
---
 drivers/edac/Kconfig    |   1 +
 drivers/edac/skx_edac.c | 177 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 165 insertions(+), 13 deletions(-)

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 57304b2e989f..ffd349c12479 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -234,6 +234,7 @@ config EDAC_SKX
 	depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
 	depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y
 	select DMI
+	select ACPI_ADXL
 	help
 	  Support for error detection and correction the Intel
 	  Skylake server Integrated Memory Controllers. If your
diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c
index a710169abdbc..2989ebad8970 100644
--- a/drivers/edac/skx_edac.c
+++ b/drivers/edac/skx_edac.c
@@ -26,6 +26,7 @@
 #include <linux/bitmap.h>
 #include <linux/math64.h>
 #include <linux/mod_devicetable.h>
+#include <linux/adxl.h>
 #include <acpi/nfit.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
@@ -35,6 +36,7 @@
 #include "edac_module.h"
 
 #define EDAC_MOD_STR    "skx_edac"
+#define MSG_SIZE	1024
 
 /*
  * Debug macros
@@ -54,6 +56,17 @@
 static LIST_HEAD(skx_edac_list);
 
 static u64 skx_tolm, skx_tohm;
+static int nvdimm_count;
+static char component_names[][32] = {
+	"ProcessorSocketId",
+	"MemoryControllerId",
+	"ChannelId",
+	"DimmSlotId",
+};
+
+static int component_indices[ARRAY_SIZE(component_names)];
+static int adxl_component_count;
+static const char * const *adxl_component_names;
 
 #define NUM_IMC			2	/* memory controllers per socket */
 #define NUM_CHANNELS		3	/* channels per memory controller */
@@ -102,11 +115,11 @@ struct decoded_addr {
 	u64	addr;
 	int	socket;
 	int	imc;
-	int	channel;
+	u64	channel;
 	u64	chan_addr;
 	int	sktways;
 	int	chanways;
-	int	dimm;
+	u64	dimm;
 	int	rank;
 	int	channel_rank;
 	u64	rank_address;
@@ -393,6 +406,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
 	u16 flags;
 	u64 size = 0;
 
+	nvdimm_count++;
+
 	dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc,
 						   imc->src_id, 0);
 
@@ -682,7 +697,7 @@ static bool skx_sad_decode(struct decoded_addr *res)
 	res->imc = GET_BITFIELD(d->mcroute, lchan * 3, lchan * 3 + 2);
 	res->channel = GET_BITFIELD(d->mcroute, lchan * 2 + 18, lchan * 2 + 19);
 
-	edac_dbg(2, "%llx: socket=%d imc=%d channel=%d\n",
+	edac_dbg(2, "%llx: socket=%d imc=%d channel=%llu\n",
 		 res->addr, res->socket, res->imc, res->channel);
 	return true;
 }
@@ -818,7 +833,7 @@ static bool skx_rir_decode(struct decoded_addr *res)
 	res->dimm = chan_rank / 4;
 	res->rank = chan_rank % 4;
 
-	edac_dbg(2, "%llx: dimm=%d rank=%d chan_rank=%d rank_addr=%llx\n",
+	edac_dbg(2, "%llx: dimm=%llu rank=%d chan_rank=%d rank_addr=%llx\n",
 		 res->addr, res->dimm, res->rank,
 		 res->channel_rank, res->rank_address);
 	return true;
@@ -894,12 +909,55 @@ static bool skx_decode(struct decoded_addr *res)
 		skx_rir_decode(res) && skx_mad_decode(res);
 }
 
+static bool skx_dsm_decode(u64 addr, char *msg, int msglen,
+			   u64 *sock, u64 *imc,
+			   u64 *chan, u64 *dimm)
+
+{
+	u64 *values;
+	int i, len = 0;
+
+	if (addr >= skx_tohm || (addr >= skx_tolm && addr < BIT_ULL(32))) {
+		edac_dbg(0, "Address %llx out of range\n", addr);
+		return false;
+	}
+
+	values = kcalloc(adxl_component_count, sizeof(*values), GFP_KERNEL);
+	if (!values) {
+		edac_dbg(0, "Out of memory\n");
+		return false;
+	}
+
+	if (adxl_decode(addr, values)) {
+		edac_dbg(0, "Failed to decode 0x%llx\n", addr);
+		kfree(values);
+		return false;
+	}
+
+	*sock = values[component_indices[0]];
+	*imc = values[component_indices[1]];
+	*chan = values[component_indices[2]];
+	*dimm = values[component_indices[3]];
+
+	for (i = 0; i < adxl_component_count; i++) {
+		len += snprintf(msg + len, msglen, " %s:0x%llx",
+				adxl_component_names[i], values[i]);
+
+		if (msglen - len <= 0)
+			break;
+	}
+
+	kfree(values);
+	return true;
+}
+
 static void skx_mce_output_error(struct mem_ctl_info *mci,
 				 const struct mce *m,
-				 struct decoded_addr *res)
+				 struct decoded_addr *res,
+				 char *msg)
 {
 	enum hw_event_mc_err_type tp_event;
-	char *type, *optype, msg[256];
+	char *type, *optype;
 	bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
 	bool overflow = GET_BITFIELD(m->status, 62, 62);
 	bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
@@ -960,13 +1018,11 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 		}
 	}
 
-	snprintf(msg, sizeof(msg),
-		 "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
+	snprintf(msg, MSG_SIZE,
+		 "%s%s err_code:%04x:%04x %s",
 		 overflow ? " OVERFLOW" : "",
 		 (uncorrected_error && recoverable) ? " recoverable" : "",
-		 mscod, errcode,
-		 res->socket, res->imc, res->rank,
-		 res->bank_group, res->bank_address, res->row, res->column);
+		 mscod, errcode, msg + MSG_SIZE);
 
 	edac_dbg(0, "%s\n", msg);
 
@@ -977,13 +1033,33 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 			     optype, msg);
 }
 
+static struct mem_ctl_info *get_mci(u64 src_id, u64 lmc)
+{
+	struct skx_dev *d;
+
+	if (lmc > NUM_IMC - 1) {
+		skx_printk(KERN_ERR, "Bad mc# 0x%llx\n", lmc);
+		return NULL;
+	}
+
+	list_for_each_entry(d, &skx_edac_list, list) {
+		if (d->imc[0].src_id == src_id)
+			return d->imc[lmc].mci;
+	}
+
+	skx_printk(KERN_ERR, "No mci for src_id %llx lmc %llx\n", src_id, lmc);
+
+	return NULL;
+}
+
 static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 			       void *data)
 {
 	struct mce *mce = (struct mce *)data;
 	struct decoded_addr res;
 	struct mem_ctl_info *mci;
-	char *type;
+	u64 socket, memctrl;
+	char *type, *msg;
 
 	if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
 		return NOTIFY_DONE;
@@ -992,11 +1068,35 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 	if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
 		return NOTIFY_DONE;
 
+	msg = kzalloc(MSG_SIZE * 2, GFP_KERNEL);
+	if (!msg)
+		return NOTIFY_DONE;
+
+	if (adxl_component_count)
+		goto dsm_decoding;
+
 	res.addr = mce->addr;
 	if (!skx_decode(&res))
 		return NOTIFY_DONE;
 	mci = res.dev->imc[res.imc].mci;
 
+	snprintf(msg + MSG_SIZE, MSG_SIZE,
+		 "socket:%d imc:%d chan:%llu dimm:%llu rank:%d bg:%d ba:%d row:%x col:%x",
+		 res.socket, res.imc, res.channel, res.dimm, res.rank,
+		 res.bank_group, res.bank_address, res.row, res.column);
+
+	goto decoded;
+
+dsm_decoding:
+	if (!skx_dsm_decode(mce->addr, msg + MSG_SIZE, MSG_SIZE,
+			    &socket, &memctrl, &res.channel, &res.dimm))
+		goto out;
+
+	mci = get_mci(socket, memctrl);
+	if (!mci)
+		goto out;
+
+decoded:
 	if (mce->mcgstatus & MCG_STATUS_MCIP)
 		type = "Exception";
 	else
@@ -1015,8 +1115,10 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 			  "%u APIC %x\n", mce->cpuvendor, mce->cpuid,
 			  mce->time, mce->socketid, mce->apicid);
 
-	skx_mce_output_error(mci, mce, &res);
+	skx_mce_output_error(mci, mce, &res, msg);
 
+out:
+	kfree(msg);
 	return NOTIFY_DONE;
 }
 
@@ -1090,6 +1192,44 @@ static void skx_remove(void)
 	}
 }
 
+static void __init skx_dsm_get(void)
+{
+	const char * const *names;
+	int i, j, n;
+
+	names = adxl_get_component_names();
+	if (!names) {
+		skx_printk(KERN_NOTICE, "No firmware supports for address translation.");
+		skx_printk(KERN_CONT, " Only decoding DDR4 address!\n");
+		return;
+	}
+
+	n = ARRAY_SIZE(component_names);
+	for (i = 0; i < n; i++) {
+		for (j = 0; names[j]; j++) {
+			if (!strcmp(component_names[i], names[j])) {
+				component_indices[i] = j;
+				break;
+			}
+		}
+
+		if (!names[j])
+			goto err;
+	}
+
+	adxl_component_names = names;
+	while (*names++)
+		adxl_component_count++;
+
+	return;
+err:
+	skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ",
+		   component_names[i]);
+	for (j = 0; names[j]; j++)
+		skx_printk(KERN_CONT, "%s ", names[j]);
+	skx_printk(KERN_CONT, "\n");
+}
+
 /*
  * skx_init:
  *	make sure we are running on the correct cpu model
@@ -1154,6 +1294,9 @@ static int __init skx_init(void)
 		}
 	}
 
+	if (nvdimm_count)
+		skx_dsm_get();
+
 	/* Ensure that the OPSTATE is set correctly for POLL or NMI */
 	opstate_init();
 
@@ -1180,6 +1323,14 @@ module_exit(skx_exit);
 
 module_param(edac_op_state, int, 0444);
 MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
+module_param_string(sock, component_names[0], sizeof(component_names[0]), 0644);
+MODULE_PARM_DESC(sock, "String to get socket ID");
+module_param_string(imc, component_names[1], sizeof(component_names[1]), 0644);
+MODULE_PARM_DESC(imc, "String to get IMC ID");
+module_param_string(chan, component_names[2], sizeof(component_names[2]), 0644);
+MODULE_PARM_DESC(chan, "String to get channel ID");
+module_param_string(dimm, component_names[3], sizeof(component_names[3]), 0644);
+MODULE_PARM_DESC(dimm, "String to get DIMM ID");
 
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Tony Luck");
-- 
2.17.1




[Index of Archives]     [Linux IBM ACPI]     [Linux Power Management]     [Linux Kernel]     [Linux Laptop]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]     [Linux Resources]

  Powered by Linux