On Wed, 25 Sep 2024 10:40:38 +0800 Zhao Qunqin <zhaoqunqin@xxxxxxxxxxx> wrote: > Reports single bit errors (CE) only. > > Signed-off-by: Zhao Qunqin <zhaoqunqin@xxxxxxxxxxx> Hi. A few quick comments inline Jonathan > --- > Changes in v5: > - Drop the loongson_ prefix from all static functions. > - Align function arguments on the opening brace. > - Drop useless comments and useless wrapper. Drop side comments. > - Reorder variable declarations. > > Changes in v4: > - None > > Changes in v3: > - Addressed review comments raised by Krzysztof and Huacai > > Changes in v2: > - Addressed review comments raised by Krzysztof > > MAINTAINERS | 1 + > arch/loongarch/Kconfig | 1 + > drivers/edac/Kconfig | 8 ++ > drivers/edac/Makefile | 1 + > drivers/edac/loongson_edac.c | 168 +++++++++++++++++++++++++++++++++++ > 5 files changed, 179 insertions(+) > create mode 100644 drivers/edac/loongson_edac.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index 6cc8cfc8f..5b4526638 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -13242,6 +13242,7 @@ M: Zhao Qunqin <zhaoqunqin@xxxxxxxxxxx> > L: linux-edac@xxxxxxxxxxxxxxx > S: Maintained > F: Documentation/devicetree/bindings/edac/loongson,ls3a5000-mc-edac.yaml > +F: drivers/edac/loongson_edac.c > > LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI) > M: Sathya Prakash <sathya.prakash@xxxxxxxxxxxx> > diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig > index 70f169210..9c135f1a2 100644 > --- a/arch/loongarch/Kconfig > +++ b/arch/loongarch/Kconfig > @@ -181,6 +181,7 @@ config LOONGARCH > select PCI_MSI_ARCH_FALLBACKS > select PCI_QUIRKS > select PERF_USE_VMALLOC > + select EDAC_SUPPORT > select RTC_LIB > select SPARSE_IRQ > select SYSCTL_ARCH_UNALIGN_ALLOW > diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig > index 81af6c344..719bb6ca7 100644 > --- a/drivers/edac/Kconfig > +++ b/drivers/edac/Kconfig > @@ -564,5 +564,13 @@ config EDAC_VERSAL > Support injecting both correctable and uncorrectable errors > for debugging purposes. 
> > +config EDAC_LOONGSON3 > + tristate "Loongson-3 Memory Controller" > + depends on LOONGARCH || COMPILE_TEST > + help > + Support for error detection and correction on the Loongson-3 > + family memory controller. This driver reports single bit > + errors (CE) only. Loongson-3A5000/3C5000/3D5000/3C5000L/3A6000/3C6000 > + are compatible. > > endif # EDAC > diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile > index faf310eec..e72ca1be4 100644 > --- a/drivers/edac/Makefile > +++ b/drivers/edac/Makefile > @@ -88,3 +88,4 @@ obj-$(CONFIG_EDAC_DMC520) += dmc520_edac.o > obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o > obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o > obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o > +obj-$(CONFIG_EDAC_LOONGSON3) += loongson_edac.o > diff --git a/drivers/edac/loongson_edac.c b/drivers/edac/loongson_edac.c > new file mode 100644 > index 000000000..2721dfba5 > --- /dev/null > +++ b/drivers/edac/loongson_edac.c > @@ -0,0 +1,168 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2024 Loongson Technology Corporation Limited. > + */ > + > +#include <linux/edac.h> > +#include <linux/module.h> > +#include <linux/init.h> > +#include <linux/platform_device.h> > + > +#include "edac_module.h" > + > +enum ecc_index { > + ECC_SET = 0, > + ECC_RESERVED, > + ECC_COUNT, > + ECC_CS_COUNT, > + ECC_CODE, > + ECC_ADDR, > + ECC_DATA0, > + ECC_DATA1, > + ECC_DATA2, > + ECC_DATA3, > +}; > + > +struct loongson_edac_pvt { > + u64 *ecc_base; > + int last_ce_count; > +}; > + > +static int read_ecc(struct mem_ctl_info *mci) > +{ > + struct loongson_edac_pvt *pvt = mci->pvt_info; > + u64 ecc; > + int cs; > + > + if (!pvt->ecc_base) > + return pvt->last_ce_count; > + > + ecc = pvt->ecc_base[ECC_CS_COUNT]; > + /* cs0 -- cs3 */ > + cs = ecc & 0xff; > + cs += (ecc >> 8) & 0xff; > + cs += (ecc >> 16) & 0xff; > + cs += (ecc >> 24) & 0xff; This smells like an endian swap. swab32() or is this fixing a wrong endian register? 
In which case be32_to_cpu() > + > + return cs; > +} > + > +static void edac_check(struct mem_ctl_info *mci) > +{ > + struct loongson_edac_pvt *pvt = mci->pvt_info; > + int new, add; > + > + new = read_ecc(mci); > + add = new - pvt->last_ce_count; > + pvt->last_ce_count = new; > + if (add <= 0) This has me a little confused. Either this counter can wrap, in which case why drop out here, or it can't, in which case how can add < 0 occur? > + return; > + > + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, add, > + 0, 0, 0, 0, 0, -1, "error", ""); > + edac_mc_printk(mci, KERN_INFO, "add: %d", add); > +} > + > +static int get_dimm_config(struct mem_ctl_info *mci) > +{ > + struct dimm_info *dimm; > + u32 size, npages; > + > + /* size not used */ > + size = -1; > + npages = MiB_TO_PAGES(size); > + > + dimm = edac_get_dimm(mci, 0, 0, 0); > + dimm->nr_pages = npages; > + snprintf(dimm->label, sizeof(dimm->label), > + "MC#%uChannel#%u_DIMM#%u", mci->mc_idx, 0, 0); > + dimm->grain = 8; > + > + return 0; > +} > + > +static void pvt_init(struct mem_ctl_info *mci, u64 *vbase) > +{ > + struct loongson_edac_pvt *pvt = mci->pvt_info; > + > + pvt->ecc_base = vbase; > + pvt->last_ce_count = read_ecc(mci); > +} > + > +static int edac_probe(struct platform_device *pdev) > +{ > + struct edac_mc_layer layers[2]; > + struct loongson_edac_pvt *pvt; > + struct mem_ctl_info *mci; > + u64 *vbase; > + int ret; > + > + vbase = devm_platform_ioremap_resource(pdev, 0); > + if (IS_ERR(vbase)) > + return PTR_ERR(vbase); > + > + /* allocate a new MC control structure */ > + layers[0].type = EDAC_MC_LAYER_CHANNEL; > + layers[0].size = 1; > + layers[0].is_virt_csrow = false; > + layers[1].type = EDAC_MC_LAYER_SLOT; > + layers[1].size = 1; > + layers[1].is_virt_csrow = true; Could move this to a C99-style designated initializer: struct edac_mc_layer layers[2] = { { .type = EDAC_MC_LAYER_CHANNEL, .size = 1, .is_virt_csrow = false, }, { .type = EDAC_MC_LAYER_SLOT, .size = 1, .is_virt_csrow = true, } }; Not particularly important though. 
> + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); > + if (mci == NULL) Probably !mci is sufficient, but I'm not sure about local EDAC style. > + return -ENOMEM; > + > + mci->mc_idx = edac_device_alloc_index(); > + mci->mtype_cap = MEM_FLAG_RDDR4; > + mci->edac_ctl_cap = EDAC_FLAG_NONE; > + mci->edac_cap = EDAC_FLAG_NONE; > + mci->mod_name = "loongson_edac.c"; > + mci->ctl_name = "loongson_edac_ctl"; > + mci->dev_name = "loongson_edac_dev"; > + mci->ctl_page_to_phys = NULL; > + mci->pdev = &pdev->dev; > + mci->error_desc.grain = 8; > + /* Set the function pointer to an actual operation function */ > + mci->edac_check = edac_check; Similar to above, can initialize this structure more cleanly using *mci = (struct mem_ctl_info) { .mc_idx = edac_device_alloc_index(), ... }; > + > + pvt_init(mci, vbase); > + get_dimm_config(mci); > + > + ret = edac_mc_add_mc(mci); I'd be tempted to use devm_add_action_or_reset() for this and the alloc above. That is not common in EDAC, but it is done in al_mc_edac.c if you want an example. > + if (ret) { > + edac_dbg(0, "MC: failed edac_mc_add_mc()\n"); > + edac_mc_free(mci); > + return ret; > + } > + edac_op_state = EDAC_OPSTATE_POLL; > + > + return 0; > +} > + > +static void edac_remove(struct platform_device *pdev) > +{ > + struct mem_ctl_info *mci = edac_mc_del_mc(&pdev->dev); > + > + if (mci) > + edac_mc_free(mci); It would be very odd to get to remove() and have edac_mc_del_mc() fail. Do we need this check? At least some drivers (I checked a few at random) don't check this. 
> +} > + > +static const struct of_device_id loongson_edac_of_match[] = { > + { .compatible = "loongson,ls3a5000-mc-edac", }, > + {} > +}; > +MODULE_DEVICE_TABLE(of, loongson_edac_of_match); > + > +static struct platform_driver loongson_edac_driver = { > + .probe = edac_probe, > + .remove = edac_remove, > + .driver = { > + .name = "loongson-mc-edac", > + .of_match_table = loongson_edac_of_match, > + }, > +}; > +module_platform_driver(loongson_edac_driver); > + > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR("Zhao Qunqin <zhaoqunqin@xxxxxxxxxxx>"); > +MODULE_DESCRIPTION("EDAC driver for loongson memory controller");