Quoting Venkata Narendra Kumar Gutta (2018-08-17 17:08:34) > diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig > index 57304b2..da8f150 100644 > --- a/drivers/edac/Kconfig > +++ b/drivers/edac/Kconfig > @@ -460,4 +460,32 @@ config EDAC_TI > Support for error detection and correction on the > TI SoCs. > > +config EDAC_QCOM > + tristate "QCOM EDAC Controller" > + depends on EDAC > + help > + Support for error detection and correction on the > + QCOM SoCs. > + > +config EDAC_QCOM_LLCC > + tristate "QCOM EDAC Controller for LLCC Cache" > + depends on EDAC_QCOM && QCOM_LLCC > + help > + Support for error detection and correction on the > + QCOM LLCC cache. Report errors caught by LLCC ECC > + mechanism. > + > + For debugging issues having to do with stability and overall system > + health, you should probably say 'Y' here. > + > +config EDAC_QCOM_LLCC_PANIC_ON_UE > + bool "Panic on uncorrectable errors - qcom llcc" Why isn't this a generic option for all EDAC? > + depends on EDAC_QCOM_LLCC > + help > + Forcibly cause a kernel panic if an uncorrectable error (UE) is > + detected. This can reduce debugging times on hardware which may be > + operating at voltages or frequencies outside normal specification. > + > + For production builds, you should probably say 'N' here. > + > endif # EDAC > diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile > index 02b43a7..716096d 100644 > --- a/drivers/edac/Makefile > +++ b/drivers/edac/Makefile > @@ -77,3 +77,4 @@ obj-$(CONFIG_EDAC_ALTERA) += altera_edac.o > obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o > obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o > obj-$(CONFIG_EDAC_TI) += ti_edac.o > +obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o Maybe put this in sort of alphabetical order so conflicts don't happen. > diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c > new file mode 100644 > index 0000000..9a8c670 > --- /dev/null > +++ b/drivers/edac/qcom_edac.c > @@ -0,0 +1,446 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (c) 2018, The Linux Foundation. All rights reserved. > + */ > + > +#include <linux/edac.h> > +#include <linux/interrupt.h> > +#include <linux/kernel.h> > +#include <linux/of_device.h> Used? Maybe it should just be linux/of.h > +#include <linux/platform_device.h> > +#include <linux/regmap.h> > +#include <linux/smp.h> Why? > +#include <linux/soc/qcom/llcc-qcom.h> > + > +#include "edac_mc.h" > +#include "edac_device.h" > + > +#ifdef CONFIG_EDAC_QCOM_LLCC_PANIC_ON_UE > +#define LLCC_ERP_PANIC_ON_UE 1 > +#else > +#define LLCC_ERP_PANIC_ON_UE 0 > +#endif > + > +#define EDAC_LLCC "qcom_llcc" > + > +#define TRP_SYN_REG_CNT 6 > + > +#define DRP_SYN_REG_CNT 8 > + > +#define LLCC_COMMON_STATUS0 0x0003000C > +#define LLCC_LB_CNT_MASK GENMASK(31, 28) > +#define LLCC_LB_CNT_SHIFT 28 > + > +/* single & Double Bit syndrome register offsets */ > +#define TRP_ECC_SB_ERR_SYN0 0x0002304C > +#define TRP_ECC_DB_ERR_SYN0 0x00020370 > +#define DRP_ECC_SB_ERR_SYN0 0x0004204C > +#define DRP_ECC_DB_ERR_SYN0 0x00042070 > + > +/* Error register offsets */ > +#define TRP_ECC_ERROR_STATUS1 0x00020348 > +#define TRP_ECC_ERROR_STATUS0 0x00020344 > +#define DRP_ECC_ERROR_STATUS1 0x00042048 > +#define DRP_ECC_ERROR_STATUS0 0x00042044 > + > +/* TRP, DRP interrupt register offsets */ > +#define DRP_INTERRUPT_STATUS 0x00041000 > +#define TRP_INTERRUPT_0_STATUS 0x00020480 > +#define DRP_INTERRUPT_CLEAR 0x00041008 > +#define DRP_ECC_ERROR_CNTR_CLEAR 0x00040004 > +#define TRP_INTERRUPT_0_CLEAR 0x00020484 > +#define TRP_ECC_ERROR_CNTR_CLEAR 0x00020440 > + > +/* Mask and shift macros */ > +#define ECC_DB_ERR_COUNT_MASK GENMASK(4, 0) > +#define ECC_DB_ERR_WAYS_MASK GENMASK(31, 16) > +#define ECC_DB_ERR_WAYS_SHIFT BIT(4) > + > +#define ECC_SB_ERR_COUNT_MASK GENMASK(23, 16) > +#define ECC_SB_ERR_COUNT_SHIFT BIT(4) > +#define ECC_SB_ERR_WAYS_MASK GENMASK(15, 0) > + > +#define SB_ECC_ERROR BIT(0) > +#define DB_ECC_ERROR BIT(1) > + > +#define DRP_TRP_INT_CLEAR GENMASK(1, 0) > +#define DRP_TRP_CNT_CLEAR GENMASK(1, 0) > + > +/* Config registers offsets*/ > +#define DRP_ECC_ERROR_CFG 0x00040000 > + > +/* TRP, DRP interrupt register offsets */ > +#define CMN_INTERRUPT_0_ENABLE 0x0003001C > +#define CMN_INTERRUPT_2_ENABLE 0x0003003C > +#define TRP_INTERRUPT_0_ENABLE 0x00020488 > +#define DRP_INTERRUPT_ENABLE 0x0004100C > + > +#define SB_ERROR_THRESHOLD 0x1 > +#define SB_ERROR_THRESHOLD_SHIFT 24 > +#define SB_DB_TRP_INTERRUPT_ENABLE 0x3 > +#define TRP0_INTERRUPT_ENABLE 0x1 > +#define DRP0_INTERRUPT_ENABLE BIT(6) > +#define SB_DB_DRP_INTERRUPT_ENABLE 0x3 > + > +enum { > + LLCC_DRAM_CE = 0, > + LLCC_DRAM_UE, > + LLCC_TRAM_CE, > + LLCC_TRAM_UE, > + LLCC_ERR_TYPE_MAX = LLCC_TRAM_UE + 1, What's the point? > +}; > + > +static int qcom_llcc_core_setup(struct regmap *llcc_bcast_regmap) > +{ > + u32 sb_err_threshold; > + int ret; > + > + /* Enable TRP in instance 2 of common interrupt enable register */ What is TRP? > + ret = regmap_update_bits(llcc_bcast_regmap, CMN_INTERRUPT_2_ENABLE, > + TRP0_INTERRUPT_ENABLE, > + TRP0_INTERRUPT_ENABLE); > + if (ret) > + return ret; > + > + /* Enable ECC interrupts on Tag Ram */ > + ret = regmap_update_bits(llcc_bcast_regmap, TRP_INTERRUPT_0_ENABLE, > + SB_DB_TRP_INTERRUPT_ENABLE, > + SB_DB_TRP_INTERRUPT_ENABLE); > + if (ret) > + return ret; > + > + /* Enable SB error for Data RAM */ SB is single bit? > + sb_err_threshold = (SB_ERROR_THRESHOLD << SB_ERROR_THRESHOLD_SHIFT); > + ret = regmap_write(llcc_bcast_regmap, DRP_ECC_ERROR_CFG, > + sb_err_threshold); > + if (ret) > + return ret; > + > + /* Enable DRP in instance 2 of common interrupt enable register */ DRP is double bit? > + ret = regmap_update_bits(llcc_bcast_regmap, CMN_INTERRUPT_2_ENABLE, > + DRP0_INTERRUPT_ENABLE, > + DRP0_INTERRUPT_ENABLE); > + if (ret) > + return ret; > + > + /* Enable ECC interrupts on Data Ram */ > + ret = regmap_write(llcc_bcast_regmap, DRP_INTERRUPT_ENABLE, > + SB_DB_DRP_INTERRUPT_ENABLE); > + return ret; > +} > + > +/* Clear the error interrupt and counter registers */ > +static int > +qcom_llcc_clear_errors_status(int err_type, struct llcc_drv_data *drv) > +{ > + int ret = 0; > + > + switch (err_type) { > + case LLCC_DRAM_CE: > + case LLCC_DRAM_UE: > + /* Clear the interrupt */ > + ret = regmap_write(drv->bcast_regmap, DRP_INTERRUPT_CLEAR, > + DRP_TRP_INT_CLEAR); > + if (ret) > + return ret; > + > + /* Clear the counters */ A lot of these comments are just saying what the register write is doing which is fairly obvious from the register names. Can you remove these obvious comments? > + ret = regmap_write(drv->bcast_regmap, DRP_ECC_ERROR_CNTR_CLEAR, > + DRP_TRP_CNT_CLEAR); > + if (ret) > + return ret; > + break; > + case LLCC_TRAM_CE: > + case LLCC_TRAM_UE: > + ret = regmap_write(drv->bcast_regmap, TRP_INTERRUPT_0_CLEAR, > + DRP_TRP_INT_CLEAR); > + if (ret) > + return ret; > + > + ret = regmap_write(drv->bcast_regmap, TRP_ECC_ERROR_CNTR_CLEAR, > + DRP_TRP_CNT_CLEAR); > + if (ret) > + return ret; > + break; > + } > + return ret; > +} > + > +/* Dump Syndrome registers data for Tag RAM, Data RAM bit errors*/ > +static int > +dump_syn_reg_values(struct llcc_drv_data *drv, u32 bank, int err_type) > +{ > + struct llcc_edac_reg_data *reg_data = &(drv->edac_reg[err_type]); const? Also, drop the useless parenthesis. > + int err_cnt, err_ways, ret, i; > + u32 synd_reg, synd_val; > + > + for (i = 0; i < reg_data->reg_cnt; i++) { > + synd_reg = reg_data->synd_reg + (i * 4); > + ret = regmap_read(drv->regmap, drv->offsets[bank] + synd_reg, > + &synd_val); > + if (ret) > + goto clear; > + edac_printk(KERN_CRIT, EDAC_LLCC, "%s: ECC_SYN%d: 0x%8x\n", > + reg_data->err_name, i, synd_val); > + } > + > + ret = regmap_read(drv->regmap, > + drv->offsets[bank] + reg_data->err_status_reg, > + &err_cnt); > + if (ret) > + goto clear; > + > + err_cnt &= reg_data->err_count_mask; > + err_cnt >>= reg_data->err_count_shift; > + edac_printk(KERN_CRIT, EDAC_LLCC, "%s: error count: 0x%4x\n", > + reg_data->err_name, err_cnt); > + > + ret = regmap_read(drv->regmap, > + drv->offsets[bank] + reg_data->err_ways_status, > + &err_ways); > + if (ret) > + goto clear; > + > + err_ways &= reg_data->err_ways_mask; > + err_ways >>= reg_data->err_ways_shift; > + > + edac_printk(KERN_CRIT, EDAC_LLCC, "%s: error ways: 0x%4x\n", > + reg_data->err_name, err_ways); > + > +clear: > + ret = qcom_llcc_clear_errors_status(err_type, drv); > + return ret; Just 'return qcom_llcc_clear_errors_status(...)' > +} > + > +static int > +dump_syn_reg(struct edac_device_ctl_info *edev_ctl, int err_type, u32 bank) > +{ > + struct llcc_drv_data *drv = edev_ctl->pvt_info; > + int ret = 0; Please don't assign local variables at the start of a function and then reassign them again immediately after. It hides initialization problems. > + > + ret = dump_syn_reg_values(drv, bank, err_type); > + if (ret) > + return ret; > + > + switch (err_type) { > + case LLCC_DRAM_CE: > + edac_device_handle_ce(edev_ctl, 0, bank, > + "LLCC Data RAM correctable Error"); > + break; > + case LLCC_DRAM_UE: > + edac_device_handle_ue(edev_ctl, 0, bank, > + "LLCC Data RAM uncorrectable Error"); > + break; > + case LLCC_TRAM_CE: > + edac_device_handle_ce(edev_ctl, 0, bank, > + "LLCC Tag RAM correctable Error"); > + break; > + case LLCC_TRAM_UE: > + edac_device_handle_ue(edev_ctl, 0, bank, > + "LLCC Tag RAM uncorrectable Error"); > + break; > + } > + > + return ret; > +} > + > +static irqreturn_t > +llcc_ecc_irq_handler(int irq, void *edev_ctl) > +{ > + struct edac_device_ctl_info *edac_dev_ctl; > + irqreturn_t irq_rc = IRQ_NONE; > + u32 drp_error, trp_error, i; > + struct llcc_drv_data *drv; > + int ret; > + > + edac_dev_ctl = (struct edac_device_ctl_info *)edev_ctl; Remove useless cast please. > + drv = edac_dev_ctl->pvt_info; > + > + for (i = 0; i < drv->num_banks; i++) { > + /* Look for Data RAM errors */ > + ret = regmap_read(drv->regmap, > + drv->offsets[i] + DRP_INTERRUPT_STATUS, > + &drp_error); > + if (ret) > + return irq_rc; > + > + if (drp_error & SB_ECC_ERROR) { > + edac_printk(KERN_CRIT, EDAC_LLCC, > + "Single Bit Error detected in Data Ram\n"); > + ret = dump_syn_reg(edev_ctl, LLCC_DRAM_CE, i); > + if (!ret) > + irq_rc = IRQ_HANDLED; Rename 'irq_rc' to 'handled' and make it a bool and then break from all these 'return irq_rc' places and 'if (handled) return IRQ_HANDLED; return IRQ_NONE;' at the end of this function so we have one exit point from the irq handler. > + } else if (drp_error & DB_ECC_ERROR) { > + edac_printk(KERN_CRIT, EDAC_LLCC, > + "Double Bit Error detected in Data Ram\n"); > + ret = dump_syn_reg(edev_ctl, LLCC_DRAM_UE, i); > + if (!ret) > + irq_rc = IRQ_HANDLED; > + } > + > + /* Look for Tag RAM errors */ > + ret = regmap_read(drv->regmap, > + drv->offsets[i] + TRP_INTERRUPT_0_STATUS, > + &trp_error); > + if (ret) > + return irq_rc; > + > + if (trp_error & SB_ECC_ERROR) { > + edac_printk(KERN_CRIT, EDAC_LLCC, > + "Single Bit Error detected in Tag Ram\n"); > + ret = dump_syn_reg(edev_ctl, LLCC_TRAM_CE, i); > + if (!ret) > + irq_rc = IRQ_HANDLED; > + } else if (trp_error & DB_ECC_ERROR) { > + edac_printk(KERN_CRIT, EDAC_LLCC, > + "Double Bit Error detected in Tag Ram\n"); > + ret = dump_syn_reg(edev_ctl, LLCC_TRAM_UE, i); > + if (!ret) > + irq_rc = IRQ_HANDLED; > + } > + } > + > + return irq_rc; > +} > + > +static void llcc_edac_reg_data_init(struct llcc_edac_reg_data *edac_reg) > +{ > + > + struct llcc_edac_reg_data *reg_data; > + > + /* Initialize register info for LLCC_DRAM_CE */ > + reg_data = &edac_reg[LLCC_DRAM_CE]; > + reg_data->err_name = "DRAM Single-bit"; > + reg_data->reg_cnt = DRP_SYN_REG_CNT; > + reg_data->synd_reg = DRP_ECC_SB_ERR_SYN0; > + reg_data->err_status_reg = DRP_ECC_ERROR_STATUS1; > + reg_data->err_count_mask = ECC_SB_ERR_COUNT_MASK; > + reg_data->err_count_shift = ECC_SB_ERR_COUNT_SHIFT; > + reg_data->err_ways_status = DRP_ECC_ERROR_STATUS0; > + reg_data->err_ways_mask = ECC_SB_ERR_WAYS_MASK; > + > + /* Initialize register info for LLCC_DRAM_UE */ > + reg_data = &edac_reg[LLCC_DRAM_UE]; > + reg_data->err_name = "DRAM Double-bit"; > + reg_data->reg_cnt = DRP_SYN_REG_CNT; > + reg_data->synd_reg = DRP_ECC_DB_ERR_SYN0; > + reg_data->err_status_reg = DRP_ECC_ERROR_STATUS1; > + reg_data->err_count_mask = ECC_DB_ERR_COUNT_MASK; > + reg_data->err_ways_status = DRP_ECC_ERROR_STATUS0; > + reg_data->err_ways_mask = ECC_DB_ERR_WAYS_MASK; > + reg_data->err_ways_shift = ECC_DB_ERR_WAYS_SHIFT; > + > + /* Initialize register info for LLCC_TRAM_CE */ > + reg_data = &edac_reg[LLCC_TRAM_CE]; > + reg_data->err_name = "TRAM Single-bit"; > + reg_data->reg_cnt = TRP_SYN_REG_CNT; > + reg_data->synd_reg = TRP_ECC_SB_ERR_SYN0; > + reg_data->err_status_reg = TRP_ECC_ERROR_STATUS1; > + reg_data->err_count_mask = ECC_SB_ERR_COUNT_MASK; > + reg_data->err_count_shift = ECC_SB_ERR_COUNT_SHIFT; > + reg_data->err_ways_status = TRP_ECC_ERROR_STATUS0; > + reg_data->err_ways_mask = ECC_SB_ERR_WAYS_MASK; > + > + /* Initialize register info for LLCC_TRAM_UE */ > + reg_data = &edac_reg[LLCC_TRAM_UE]; > + reg_data->err_name = "TRAM Double-bit"; > + reg_data->reg_cnt = TRP_SYN_REG_CNT; > + reg_data->synd_reg = TRP_ECC_DB_ERR_SYN0; > + reg_data->err_status_reg = TRP_ECC_ERROR_STATUS1; > + reg_data->err_count_mask = ECC_DB_ERR_COUNT_MASK; > + reg_data->err_ways_status = TRP_ECC_ERROR_STATUS0; > + reg_data->err_ways_mask = ECC_DB_ERR_WAYS_MASK; > + reg_data->err_ways_shift = ECC_DB_ERR_WAYS_SHIFT; Why can't these things be a static const array that the driver looks up in? What's the point of it being per-device instance? > +} > + > +static int qcom_llcc_edac_probe(struct platform_device *pdev) > +{ > + struct llcc_drv_data *llcc_driv_data = pdev->dev.platform_data; > + struct edac_device_ctl_info *edev_ctl; > + struct device *dev = &pdev->dev; > + int ecc_irq; > + int rc; > + > + /* Initialize register set for the error types*/ > + llcc_driv_data->edac_reg = devm_kcalloc(dev, LLCC_ERR_TYPE_MAX, > + sizeof(struct llcc_edac_reg_data), > + GFP_KERNEL); > + llcc_edac_reg_data_init(llcc_driv_data->edac_reg); > + > + rc = qcom_llcc_core_setup(llcc_driv_data->bcast_regmap); > + if (rc) > + return rc; > + > + /* Allocate edac control info */ > + edev_ctl = edac_device_alloc_ctl_info(0, "qcom-llcc", 1, "bank", > + llcc_driv_data->num_banks, 1, > + NULL, 0, > + edac_device_alloc_index()); > + > + if (!edev_ctl) > + return -ENOMEM; > + > + edev_ctl->dev = dev; > + edev_ctl->mod_name = dev_name(dev); > + edev_ctl->dev_name = dev_name(dev); > + edev_ctl->ctl_name = "llcc"; > + edev_ctl->panic_on_ue = LLCC_ERP_PANIC_ON_UE; > + edev_ctl->pvt_info = llcc_driv_data; > + > + rc = edac_device_add_device(edev_ctl); > + if (rc) > + goto out_mem; > + > + platform_set_drvdata(pdev, edev_ctl); > + > + /* Request for ecc irq */ > + ecc_irq = llcc_driv_data->ecc_irq; > + if (ecc_irq < 0) { > + rc = -ENODEV; Return error code from platform_get_irq()? Oh, that's weird. Why is some other driver getting and storing the irq away? Hopefully it isn't a shared IRQ. > + goto out_dev; > + } > + rc = devm_request_irq(dev, ecc_irq, llcc_ecc_irq_handler, > + IRQF_TRIGGER_HIGH, "llcc_ecc", edev_ctl); > + if (rc) > + goto out_dev; > + > + return rc; > + > +out_dev: > + edac_device_del_device(edev_ctl->dev); > +out_mem: > + edac_device_free_ctl_info(edev_ctl); > + > + return rc; > +} > + > +static int qcom_llcc_edac_remove(struct platform_device *pdev) > +{ > + struct edac_device_ctl_info *edev_ctl = dev_get_drvdata(&pdev->dev); > + > + edac_device_del_device(edev_ctl->dev); > + edac_device_free_ctl_info(edev_ctl); > + platform_set_drvdata(pdev, NULL); No need. Don't set platform data to NULL. > + > + return 0; > +} > + > +static const struct of_device_id qcom_llcc_edac_match_table[] = { > +#ifdef EDAC_QCOM_LLCC Huh? Shouldn't this driver only be compiled when this config is enabled? > + { .compatible = "qcom,llcc-edac" }, > +#endif > + { }, > +}; > + > +static struct platform_driver qcom_llcc_edac_driver = { > + .probe = qcom_llcc_edac_probe, > + .remove = qcom_llcc_edac_remove, > + .driver = { > + .name = "qcom_llcc_edac", > + .of_match_table = qcom_llcc_edac_match_table, > + }, > +}; > +module_platform_driver(qcom_llcc_edac_driver); > + > +MODULE_DESCRIPTION("QCOM EDAC driver"); > +MODULE_LICENSE("GPL v2"); > diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h > index 2e4b34d..25096e0 100644 > --- a/include/linux/soc/qcom/llcc-qcom.h > +++ b/include/linux/soc/qcom/llcc-qcom.h > @@ -84,6 +84,7 @@ struct llcc_drv_data { > struct regmap *regmap; > struct regmap *bcast_regmap; > const struct llcc_slice_config *cfg; > + struct llcc_edac_reg_data *edac_reg; Why a pointer instead of an array of proper size? > struct mutex lock; > u32 cfg_size; > u32 max_slices; > @@ -93,6 +94,30 @@ struct llcc_drv_data { > int ecc_irq; > }; > > +/** > + * llcc_edac_reg_data - llcc edac registers data for each error type > + * @err_name: name of the error > + * @reg_cnt: number of registers > + * @synd_reg: syndrome register address > + * @err_status_reg: Status register address to read the error count > + * @err_count_mask: Mask value to get the error count > + * @err_count_shift: Shift value to get the error count > + * @err_ways_status: Status register address to read error ways > + * @err_ways_mask: Mask value to get the error ways > + * @err_ways_shift: Shift value to get the error ways > + */ > +struct llcc_edac_reg_data { > + char *err_name; Just name instead? > + int reg_cnt; > + int synd_reg; > + int err_status_reg; > + int err_count_mask; > + int err_count_shift; > + int err_ways_status; > + int err_ways_mask; > + int err_ways_shift; Do any of these need to be signed types? Use u8 for register offsets and shifts, u32 for masks and unsigned ints for counts please. > +};