On Thu, Dec 05, 2019 at 09:53:18AM +0000, Sai Prakash Ranjan wrote: > Kryo{3,4}XX CPU cores implement RAS extensions to support > Error Correcting Code(ECC). Currently all Kryo{3,4}XX CPU > cores (gold/silver a.k.a big/LITTLE) support ECC via RAS. via RAS what? ARM64_RAS_EXTN? In any case, this needs James to look at and especially if there's some ARM-generic functionality in there which should be shared, of course. > This adds an interrupt based driver for those CPUs and s/This adds/Add/ > provides an optional polling of error recording system > registers. > > Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx> > --- > MAINTAINERS | 7 + > drivers/edac/Kconfig | 20 + > drivers/edac/Makefile | 1 + > drivers/edac/qcom_kryo_edac.c | 679 ++++++++++++++++++++++++++++++++++ > 4 files changed, 707 insertions(+) > create mode 100644 drivers/edac/qcom_kryo_edac.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index c2d80079dccc..f58c93f963f6 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -6049,6 +6049,13 @@ L: linux-edac@xxxxxxxxxxxxxxx > S: Maintained > F: drivers/edac/qcom_edac.c > > +EDAC-KRYO-QCOM > +M: Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx> > +L: linux-arm-msm@xxxxxxxxxxxxxxx > +L: linux-edac@xxxxxxxxxxxxxxx > +S: Maintained > +F: drivers/edac/qcom_kryo_edac.c > + > EDIROL UA-101/UA-1000 DRIVER > M: Clemens Ladisch <clemens@xxxxxxxxxx> > L: alsa-devel@xxxxxxxxxxxxxxxx (moderated for non-subscribers) > diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig > index 417dad635526..cd78ac2917c9 100644 > --- a/drivers/edac/Kconfig > +++ b/drivers/edac/Kconfig > @@ -508,6 +508,26 @@ config EDAC_QCOM > For debugging issues having to do with stability and overall system > health, you should probably say 'Y' here. > > +config EDAC_QCOM_KRYO > + tristate "QCOM Kryo EDAC for CPU L1/L2/L3-SCU caches" > + depends on ARCH_QCOM && ARM64_RAS_EXTN > + help > + Support for Error detection and correction on Kryo Gold and Silver CPU > + cores with RAS extensions. Currently it detects and reports all Single > + Bit Errors (SBEs) and Double Bit Errors (DBEs). > + > + For debugging issues having to do with stability and overall system > + health, you should probably say 'Y' here. > + > +config EDAC_QCOM_KRYO_POLL > + depends on EDAC_QCOM_KRYO > + bool "Poll on Kryo ECC registers" > + help > + This option chooses whether or not you want to poll on the Kryo ECC > + registers. When this is enabled, the polling rate can be set as a > + module parameter. By default, it will call the polling function every > + second. Why is this a separate option and why should people use that? Can the polling/irq be switched automatically? > + > config EDAC_ASPEED > tristate "Aspeed AST 2500 SoC" > depends on MACH_ASPEED_G5 > diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile > index d77200c9680b..29edcfa6ec0e 100644 > --- a/drivers/edac/Makefile > +++ b/drivers/edac/Makefile > @@ -85,5 +85,6 @@ obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o > obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o > obj-$(CONFIG_EDAC_TI) += ti_edac.o > obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o > +obj-$(CONFIG_EDAC_QCOM_KRYO) += qcom_kryo_edac.o What is the difference between this new driver and the qcom_edac one? Can functionality be shared? Should this new one be called simply kryo_edac instead? > obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o > obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o > diff --git a/drivers/edac/qcom_kryo_edac.c b/drivers/edac/qcom_kryo_edac.c > new file mode 100644 > index 000000000000..05b60ad3cb0e > --- /dev/null > +++ b/drivers/edac/qcom_kryo_edac.c > @@ -0,0 +1,679 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (c) 2019, The Linux Foundation. All rights reserved. > + */ > + > +#include <linux/bitfield.h> > +#include <linux/cpu_pm.h> > +#include <linux/of_irq.h> > +#include <linux/smp.h> > + > +#include <asm/cputype.h> > +#include <asm/sysreg.h> > + > +#include "edac_device.h" > +#include "edac_mc.h" > + > +#define DRV_NAME "qcom_kryo_edac" > + > +/* > + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.3 Chapter? Where? URL? > + * ARM DSU TRM Chapter B2.3 > + * CFI = Corrected Fault Handling interrupt, FI = Fault handling interrupt > + * UI = Uncorrected error recovery interrupt, ED = Error Detection > + */ > +#define KRYO_ERRXCTLR_ED BIT(0) > +#define KRYO_ERRXCTLR_UI BIT(2) > +#define KRYO_ERRXCTLR_FI BIT(3) > +#define KRYO_ERRXCTLR_CFI BIT(8) > +#define KRYO_ERRXCTLR_ENABLE (KRYO_ERRXCTLR_CFI | KRYO_ERRXCTLR_FI | \ > + KRYO_ERRXCTLR_UI | KRYO_ERRXCTLR_ED) > + > +/* > + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.4 > + * ARM DSU TRM Chapter B2.4 > + */ > +#define KRYO_ERRXFR_ED GENMASK(1, 0) > +#define KRYO_ERRXFR_DE GENMASK(3, 2) > +#define KRYO_ERRXFR_UI GENMASK(5, 4) > +#define KRYO_ERRXFR_FI GENMASK(7, 6) > +#define KRYO_ERRXFR_UE GENMASK(9, 8) > +#define KRYO_ERRXFR_CFI GENMASK(11, 10) > +#define KRYO_ERRXFR_CEC GENMASK(14, 12) > +#define KRYO_ERRXFR_RP BIT(15) > +#define KRYO_ERRXFR_SUPPORTED (KRYO_ERRXFR_ED | KRYO_ERRXFR_DE | \ > + KRYO_ERRXFR_UI | KRYO_ERRXFR_FI | \ > + KRYO_ERRXFR_UE | KRYO_ERRXFR_CFI | \ > + KRYO_ERRXFR_CEC | KRYO_ERRXFR_RP) > + > +/* > + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.5 > + * ARM DSU TRM Chapter B2.5 > + */ > +#define KRYO_ERRXMISC0_CECR GENMASK_ULL(38, 32) > +#define KRYO_ERRXMISC0_CECO GENMASK_ULL(46, 40) > + > +/* ARM Cortex-A76 TRM Chapter B3.5 */ > +#define KRYO_ERRXMISC0_UNIT GENMASK(3, 0) > +#define KRYO_ERRXMISC0_LVL GENMASK(3, 1) > + > +/* ARM Cortex-A76 TRM Chapter B3.10 has SERR bitfields 4:0 > + * but Cortex-A55, Cortex-A75 and DSU TRM has SERR bitfields 7:0. > + * Since max error record is 21, we can use bitfields 4:0 for > + * Kryo{3,4}XX CPUs. > + */ > +#define KRYO_ERRXSTATUS_SERR GENMASK(4, 0) > +#define KRYO_ERRXSTATUS_DE BIT(23) > +#define KRYO_ERRXSTATUS_CE GENMASK(25, 24) > +#define KRYO_ERRXSTATUS_MV BIT(26) > +#define KRYO_ERRXSTATUS_UE BIT(29) > +#define KRYO_ERRXSTATUS_VALID BIT(30) > + > +/* ARM Cortex-A76 TRM Chapter B3.5 > + * IC = Instruction Cache, DC = Data Cache > + */ > +#define KRYO_L1_UNIT_IC 0x1 > +#define KRYO_L2_UNIT_TLB 0x2 > +#define KRYO_L1_UNIT_DC 0x4 > +#define KRYO_L2_UNIT 0x8 > + > +/* > + * ARM Cortex-A55 TRM Chapter B2.36 > + * ARM Cortex-A75, Cortex-A76 TRM Chapter B2.37 > + */ > +#define KRYO_ERR_RECORD_L1_L2 0x0 > +#define KRYO_ERR_RECORD_L3 0x1 > + > +/* ARM DSU TRM Chapter B2.10 */ > +#define BUS_ERROR 0x12 > + > +/* QCOM Kryo CPU part numbers */ > +#define KRYO3XX_GOLD 0x802 > +#define KRYO4XX_GOLD 0x804 > +#define KRYO4XX_SILVER_V1 0x803 > +#define KRYO4XX_SILVER_V2 0x805 > + > +#define KRYO_EDAC_MSG_MAX 256 > + > +static int poll_msec = 1000; > +module_param(poll_msec, int, 0444); > + > +enum { > + KRYO_L1 = 0, > + KRYO_L2, > + KRYO_L3, > +}; > + > +/* CE = Corrected Error, UE = Uncorrected Error, DE = Deferred Error */ > +enum { > + KRYO_L1_CE = 0, > + KRYO_L1_UE, > + KRYO_L1_DE, > + KRYO_L2_CE, > + KRYO_L2_UE, > + KRYO_L2_DE, > + KRYO_L3_CE, > + KRYO_L3_UE, > + KRYO_L3_DE, > +}; > + > +struct error_record { > + u32 error_code; > + const char *error_msg; > +}; > + > +struct error_type { > + void (*fn)(struct edac_device_ctl_info *edev_ctl, > + int inst_nr, int block_nr, const char *msg); > + const char *msg; > +}; > + > +/* > + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.10 > + * ARM DSU TRM Chapter B2.10 > + */ > +static const struct error_record serror_record[] = { > + { 0x1, "Errors due to fault injection" }, > + { 0x2, "ECC error from internal data buffer" }, > + { 0x6, "ECC error on cache data RAM" }, > + { 0x7, "ECC error on cache tag or dirty RAM" }, > + { 0x8, "Parity error on TLB data RAM" }, > + { 0x9, "Parity error on TLB tag RAM" }, > + { 0x12, "Error response for a cache copyback" }, > + { 0x15, "Deferred error not supported" }, > +}; > + > +static const struct error_type err_type[] = { > + { edac_device_handle_ce, "Kryo L1 Corrected Error" }, > + { edac_device_handle_ue, "Kryo L1 Uncorrected Error" }, > + { edac_device_handle_ue, "Kryo L1 Deferred Error" }, > + { edac_device_handle_ce, "Kryo L2 Corrected Error" }, > + { edac_device_handle_ue, "Kryo L2 Uncorrected Error" }, > + { edac_device_handle_ue, "Kryo L2 Deferred Error" }, > + { edac_device_handle_ce, "L3 Corrected Error" }, > + { edac_device_handle_ue, "L3 Uncorrected Error" }, > + { edac_device_handle_ue, "L3 Deferred Error" }, > +}; > + All that is not really needed - you can put the whole error type detection and dumping in kryo_check_err_type() in nicely readable switch-case statement. No need for the function pointers and special structs. > +static struct edac_device_ctl_info __percpu *edac_dev; > +static struct edac_device_ctl_info *drv_edev_ctl; > + > +static const char *get_error_msg(u64 errxstatus) > +{ > + const struct error_record *rec; > + u32 errxstatus_serr; > + > + errxstatus_serr = FIELD_GET(KRYO_ERRXSTATUS_SERR, errxstatus); > + > + for (rec = serror_record; rec->error_code; rec++) { > + if (errxstatus_serr == rec->error_code) > + return rec->error_msg; > + } > + > + return NULL; > +} > + > +static void dump_syndrome_reg(int error_type, int level, > + u64 errxstatus, u64 errxmisc, > + struct edac_device_ctl_info *edev_ctl) > +{ > + char msg[KRYO_EDAC_MSG_MAX]; > + const char *error_msg; > + int cpu; > + > + cpu = raw_smp_processor_id(); Why raw_? > + > + error_msg = get_error_msg(errxstatus); > + if (!error_msg) > + return; > + > + snprintf(msg, KRYO_EDAC_MSG_MAX, > + "CPU%d: %s, ERRXSTATUS_EL1:%#llx ERRXMISC0_EL1:%#llx, %s", > + cpu, err_type[error_type].msg, errxstatus, errxmisc, > + error_msg); > + > + err_type[error_type].fn(edev_ctl, 0, level, msg); > +} ... > +static int kryo_l1_l2_setup_irq(struct platform_device *pdev, > + struct edac_device_ctl_info *edev_ctl) > +{ > + int cpu, errirq, faultirq, ret; > + > + edac_dev = devm_alloc_percpu(&pdev->dev, *edac_dev); > + if (!edac_dev) > + return -ENOMEM; > + > + for_each_possible_cpu(cpu) { > + preempt_disable(); > + per_cpu(edac_dev, cpu) = edev_ctl; > + preempt_enable(); > + } That sillyness doesn't belong here, if at all. ... > +static void kryo_poll_cache_error(struct edac_device_ctl_info *edev_ctl) > +{ > + if (!edev_ctl) > + edev_ctl = drv_edev_ctl; That's silly. > + > + on_each_cpu(kryo_check_l1_l2_ecc, edev_ctl, 1); > + kryo_check_l3_scu_ecc(edev_ctl); > +} ... > +static int qcom_kryo_edac_probe(struct platform_device *pdev) > +{ > + struct edac_device_ctl_info *edev_ctl; > + struct device *dev = &pdev->dev; > + int ret; > + > + qcom_kryo_edac_setup(); This function needs to have a return value saying whether it did setup the hw properly or not and the probe function needs to return here if not. -- Regards/Gruss, Boris. https://people.kernel.org/tglx/notes-about-netiquette