Hi Sai, On Thu, Dec 5, 2019 at 1:53 AM Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx> wrote: > > Kryo{3,4}XX CPU cores implement RAS extensions to support > Error Correcting Code(ECC). Currently all Kryo{3,4}XX CPU > cores (gold/silver a.k.a big/LITTLE) support ECC via RAS. > This adds an interrupt based driver for those CPUs and > provides an optional polling of error recording system > registers. > > Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx> > --- > MAINTAINERS | 7 + > drivers/edac/Kconfig | 20 + > drivers/edac/Makefile | 1 + > drivers/edac/qcom_kryo_edac.c | 679 ++++++++++++++++++++++++++++++++++ > 4 files changed, 707 insertions(+) > create mode 100644 drivers/edac/qcom_kryo_edac.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index c2d80079dccc..f58c93f963f6 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -6049,6 +6049,13 @@ L: linux-edac@xxxxxxxxxxxxxxx > S: Maintained > F: drivers/edac/qcom_edac.c > > +EDAC-KRYO-QCOM > +M: Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx> > +L: linux-arm-msm@xxxxxxxxxxxxxxx > +L: linux-edac@xxxxxxxxxxxxxxx > +S: Maintained > +F: drivers/edac/qcom_kryo_edac.c > + > EDIROL UA-101/UA-1000 DRIVER > M: Clemens Ladisch <clemens@xxxxxxxxxx> > L: alsa-devel@xxxxxxxxxxxxxxxx (moderated for non-subscribers) > diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig > index 417dad635526..cd78ac2917c9 100644 > --- a/drivers/edac/Kconfig > +++ b/drivers/edac/Kconfig > @@ -508,6 +508,26 @@ config EDAC_QCOM > For debugging issues having to do with stability and overall system > health, you should probably say 'Y' here. > > +config EDAC_QCOM_KRYO > + tristate "QCOM Kryo EDAC for CPU L1/L2/L3-SCU caches" > + depends on ARCH_QCOM && ARM64_RAS_EXTN > + help > + Support for Error detection and correction on Kryo Gold and Silver CPU > + cores with RAS extensions. Currently it detects and reports all Single > + Bit Errors (SBEs) and Double Bit Errors (DBEs). > + > + For debugging issues having to do with stability and overall system > + health, you should probably say 'Y' here. > + > +config EDAC_QCOM_KRYO_POLL > + depends on EDAC_QCOM_KRYO > + bool "Poll on Kryo ECC registers" > + help > + This option chooses whether or not you want to poll on the Kryo ECC > + registers. When this is enabled, the polling rate can be set as a > + module parameter. By default, it will call the polling function every > + second. > + > config EDAC_ASPEED > tristate "Aspeed AST 2500 SoC" > depends on MACH_ASPEED_G5 > diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile > index d77200c9680b..29edcfa6ec0e 100644 > --- a/drivers/edac/Makefile > +++ b/drivers/edac/Makefile > @@ -85,5 +85,6 @@ obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o > obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o > obj-$(CONFIG_EDAC_TI) += ti_edac.o > obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o > +obj-$(CONFIG_EDAC_QCOM_KRYO) += qcom_kryo_edac.o > obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o > obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o > diff --git a/drivers/edac/qcom_kryo_edac.c b/drivers/edac/qcom_kryo_edac.c > new file mode 100644 > index 000000000000..05b60ad3cb0e > --- /dev/null > +++ b/drivers/edac/qcom_kryo_edac.c > @@ -0,0 +1,679 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (c) 2019, The Linux Foundation. All rights reserved. > + */ > + > +#include <linux/bitfield.h> > +#include <linux/cpu_pm.h> > +#include <linux/of_irq.h> > +#include <linux/smp.h> > + > +#include <asm/cputype.h> > +#include <asm/sysreg.h> > + > +#include "edac_device.h" > +#include "edac_mc.h" > + > +#define DRV_NAME "qcom_kryo_edac" > + > +/* > + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.3 > + * ARM DSU TRM Chapter B2.3 > + * CFI = Corrected Fault Handling interrupt, FI = Fault handling interrupt > + * UI = Uncorrected error recovery interrupt, ED = Error Detection > + */ > +#define KRYO_ERRXCTLR_ED BIT(0) > +#define KRYO_ERRXCTLR_UI BIT(2) > +#define KRYO_ERRXCTLR_FI BIT(3) > +#define KRYO_ERRXCTLR_CFI BIT(8) > +#define KRYO_ERRXCTLR_ENABLE (KRYO_ERRXCTLR_CFI | KRYO_ERRXCTLR_FI | \ > + KRYO_ERRXCTLR_UI | KRYO_ERRXCTLR_ED) > + > +/* > + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.4 > + * ARM DSU TRM Chapter B2.4 > + */ > +#define KRYO_ERRXFR_ED GENMASK(1, 0) > +#define KRYO_ERRXFR_DE GENMASK(3, 2) > +#define KRYO_ERRXFR_UI GENMASK(5, 4) > +#define KRYO_ERRXFR_FI GENMASK(7, 6) > +#define KRYO_ERRXFR_UE GENMASK(9, 8) > +#define KRYO_ERRXFR_CFI GENMASK(11, 10) > +#define KRYO_ERRXFR_CEC GENMASK(14, 12) > +#define KRYO_ERRXFR_RP BIT(15) > +#define KRYO_ERRXFR_SUPPORTED (KRYO_ERRXFR_ED | KRYO_ERRXFR_DE | \ > + KRYO_ERRXFR_UI | KRYO_ERRXFR_FI | \ > + KRYO_ERRXFR_UE | KRYO_ERRXFR_CFI | \ > + KRYO_ERRXFR_CEC | KRYO_ERRXFR_RP) > + > +/* > + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.5 > + * ARM DSU TRM Chapter B2.5 > + */ > +#define KRYO_ERRXMISC0_CECR GENMASK_ULL(38, 32) > +#define KRYO_ERRXMISC0_CECO GENMASK_ULL(46, 40) > + > +/* ARM Cortex-A76 TRM Chapter B3.5 */ > +#define KRYO_ERRXMISC0_UNIT GENMASK(3, 0) > +#define KRYO_ERRXMISC0_LVL GENMASK(3, 1) > + > +/* ARM Cortex-A76 TRM Chapter B3.10 has SERR bitfields 4:0 > + * but Cortex-A55, Cortex-A75 and DSU TRM has SERR bitfields 7:0. > + * Since max error record is 21, we can use bitfields 4:0 for > + * Kryo{3,4}XX CPUs. > + */ > +#define KRYO_ERRXSTATUS_SERR GENMASK(4, 0) > +#define KRYO_ERRXSTATUS_DE BIT(23) > +#define KRYO_ERRXSTATUS_CE GENMASK(25, 24) > +#define KRYO_ERRXSTATUS_MV BIT(26) > +#define KRYO_ERRXSTATUS_UE BIT(29) > +#define KRYO_ERRXSTATUS_VALID BIT(30) > + > +/* ARM Cortex-A76 TRM Chapter B3.5 > + * IC = Instruction Cache, DC = Data Cache > + */ > +#define KRYO_L1_UNIT_IC 0x1 > +#define KRYO_L2_UNIT_TLB 0x2 > +#define KRYO_L1_UNIT_DC 0x4 > +#define KRYO_L2_UNIT 0x8 > + > +/* > + * ARM Cortex-A55 TRM Chapter B2.36 > + * ARM Cortex-A75, Cortex-A76 TRM Chapter B2.37 > + */ > +#define KRYO_ERR_RECORD_L1_L2 0x0 > +#define KRYO_ERR_RECORD_L3 0x1 > + > +/* ARM DSU TRM Chapter B2.10 */ > +#define BUS_ERROR 0x12 > + > +/* QCOM Kryo CPU part numbers */ > +#define KRYO3XX_GOLD 0x802 > +#define KRYO4XX_GOLD 0x804 > +#define KRYO4XX_SILVER_V1 0x803 > +#define KRYO4XX_SILVER_V2 0x805 > + > +#define KRYO_EDAC_MSG_MAX 256 > + > +static int poll_msec = 1000; > +module_param(poll_msec, int, 0444); > + > +enum { > + KRYO_L1 = 0, > + KRYO_L2, > + KRYO_L3, > +}; > + > +/* CE = Corrected Error, UE = Uncorrected Error, DE = Deferred Error */ > +enum { No name? > + KRYO_L1_CE = 0, > + KRYO_L1_UE, > + KRYO_L1_DE, > + KRYO_L2_CE, > + KRYO_L2_UE, > + KRYO_L2_DE, > + KRYO_L3_CE, > + KRYO_L3_UE, > + KRYO_L3_DE, > +}; > + > +struct error_record { > + u32 error_code; > + const char *error_msg; > +}; > + > +struct error_type { > + void (*fn)(struct edac_device_ctl_info *edev_ctl, > + int inst_nr, int block_nr, const char *msg); > + const char *msg; > +}; > + > +/* > + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.10 > + * ARM DSU TRM Chapter B2.10 > + */ > +static const struct error_record serror_record[] = { > + { 0x1, "Errors due to fault injection" }, > + { 0x2, "ECC error from internal data buffer" }, > + { 0x6, "ECC error on cache data RAM" }, > + { 0x7, "ECC error on cache tag or dirty RAM" }, > + { 0x8, "Parity error on TLB data RAM" }, > + { 0x9, "Parity error on TLB tag RAM" }, > + { 0x12, "Error response for a cache copyback" }, > + { 0x15, "Deferred error not supported" }, > +}; > + > +static const struct error_type err_type[] = { > + { edac_device_handle_ce, "Kryo L1 Corrected Error" }, > + { edac_device_handle_ue, "Kryo L1 Uncorrected Error" }, > + { edac_device_handle_ue, "Kryo L1 Deferred Error" }, > + { edac_device_handle_ce, "Kryo L2 Corrected Error" }, > + { edac_device_handle_ue, "Kryo L2 Uncorrected Error" }, > + { edac_device_handle_ue, "Kryo L2 Deferred Error" }, > + { edac_device_handle_ce, "L3 Corrected Error" }, > + { edac_device_handle_ue, "L3 Uncorrected Error" }, > + { edac_device_handle_ue, "L3 Deferred Error" }, > +}; A comment is warranted to indicate that err_type is indexed by the enum, as this would be easy to mess up in later changes. > + > +static struct edac_device_ctl_info __percpu *edac_dev; > +static struct edac_device_ctl_info *drv_edev_ctl; > + > +static const char *get_error_msg(u64 errxstatus) > +{ > + const struct error_record *rec; > + u32 errxstatus_serr; > + > + errxstatus_serr = FIELD_GET(KRYO_ERRXSTATUS_SERR, errxstatus); > + > + for (rec = serror_record; rec->error_code; rec++) { It looks like you expect the table to be zero terminated, but it's not. Add the missing zero entry. > + if (errxstatus_serr == rec->error_code) > + return rec->error_msg; > + } > + > + return NULL; > +} > + > +static void dump_syndrome_reg(int error_type, int level, > + u64 errxstatus, u64 errxmisc, > + struct edac_device_ctl_info *edev_ctl) > +{ > + char msg[KRYO_EDAC_MSG_MAX]; > + const char *error_msg; > + int cpu; > + > + cpu = raw_smp_processor_id(); > + > + error_msg = get_error_msg(errxstatus); > + if (!error_msg) > + return; > + > + snprintf(msg, KRYO_EDAC_MSG_MAX, > + "CPU%d: %s, ERRXSTATUS_EL1:%#llx ERRXMISC0_EL1:%#llx, %s", > + cpu, err_type[error_type].msg, errxstatus, errxmisc, > + error_msg); > + > + err_type[error_type].fn(edev_ctl, 0, level, msg); > +} > + > +static void kryo_check_err_type(u64 errxstatus, u64 errxmisc, > + struct edac_device_ctl_info *edev_ctl, > + int level) > +{ > + u32 errxstatus_ue, errxstatus_ce, errxstatus_de; > + > + errxstatus_ce = FIELD_GET(KRYO_ERRXSTATUS_CE, errxstatus); > + errxstatus_ue = FIELD_GET(KRYO_ERRXSTATUS_UE, errxstatus); > + errxstatus_de = FIELD_GET(KRYO_ERRXSTATUS_DE, errxstatus); > + > + switch (level) { > + case KRYO_L1: > + if (errxstatus_ce) > + dump_syndrome_reg(KRYO_L1_CE, level, errxstatus, > + errxmisc, edev_ctl); > + else if (errxstatus_ue) > + dump_syndrome_reg(KRYO_L1_UE, level, errxstatus, > + errxmisc, edev_ctl); > + else if (errxstatus_de) > + dump_syndrome_reg(KRYO_L1_DE, level, errxstatus, > + errxmisc, edev_ctl); > + else > + edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n"); > + break; > + case KRYO_L2: > + if (errxstatus_ce) > + dump_syndrome_reg(KRYO_L2_CE, level, errxstatus, > + errxmisc, edev_ctl); > + else if (errxstatus_ue) > + dump_syndrome_reg(KRYO_L2_UE, level, errxstatus, > + errxmisc, edev_ctl); > + else if (errxstatus_de) > + dump_syndrome_reg(KRYO_L2_DE, level, errxstatus, > + errxmisc, edev_ctl); > + else > + edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n"); > + break; > + case KRYO_L3: > + if (errxstatus_ce) > + dump_syndrome_reg(KRYO_L3_CE, level, errxstatus, > + errxmisc, edev_ctl); > + else if (errxstatus_ue) > + dump_syndrome_reg(KRYO_L3_UE, level, errxstatus, > + errxmisc, edev_ctl); > + else if (errxstatus_de) > + dump_syndrome_reg(KRYO_L3_DE, level, errxstatus, > + errxmisc, edev_ctl); > + else > + edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n"); > + break; > + default: > + edac_printk(KERN_ERR, DRV_NAME, "Unknown level\n"); > + } > +} > + > +static inline void kryo_clear_error(u64 errxstatus) > +{ > + write_sysreg_s(errxstatus, SYS_ERXSTATUS_EL1); > + isb(); Is the isb() necessary? If so, why not a dsb as well? > +} > + > +static void kryo_parse_l1_l2_cache_error(u64 errxstatus, u64 errxmisc, > + struct edac_device_ctl_info *edev_ctl, > + int cpu) > +{ > + u32 part_num = read_cpuid_part_number(); > + > + switch (part_num) { > + /* Kryo3XX gold CPU cores do not have a UNIT bitfield */ > + case KRYO3XX_GOLD: > + case KRYO4XX_SILVER_V1: > + case KRYO4XX_SILVER_V2: > + switch (FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc)) { > + case KRYO_L1: > + kryo_check_err_type(errxstatus, errxmisc, > + edev_ctl, KRYO_L1); > + break; > + case KRYO_L2: > + kryo_check_err_type(errxstatus, errxmisc, > + edev_ctl, KRYO_L2); > + break; > + default: > + edac_printk(KERN_ERR, DRV_NAME, > + "silver cpu:%d unknown error: %lu\n", cpu, > + FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc)); > + } > + break; > + /* Kryo4XX gold CPU cores have a UNIT bitfield to identify levels */ > + case KRYO4XX_GOLD: > + switch (FIELD_GET(KRYO_ERRXMISC0_UNIT, errxmisc)) { > + case KRYO_L1_UNIT_DC: > + case KRYO_L1_UNIT_IC: > + kryo_check_err_type(errxstatus, errxmisc, > + edev_ctl, KRYO_L1); > + break; > + case KRYO_L2_UNIT: > + case KRYO_L2_UNIT_TLB: > + kryo_check_err_type(errxstatus, errxmisc, > + edev_ctl, KRYO_L2); > + break; > + default: > + edac_printk(KERN_ERR, DRV_NAME, > + "gold cpu:%d unknown error: %lu\n", cpu, > + FIELD_GET(KRYO_ERRXMISC0_UNIT, errxmisc)); > + } > + break; > + default: > + edac_printk(KERN_ERR, DRV_NAME, > + "Error in matching cpu%d with part num:%u\n", > + cpu, part_num); > + } > +} > + > +static inline bool kryo_check_regs_valid(u64 errxstatus) > +{ > + /* Check if status and misc regs are valid */ > + if (!(FIELD_GET(KRYO_ERRXSTATUS_VALID, errxstatus)) || > + !(FIELD_GET(KRYO_ERRXSTATUS_MV, errxstatus))) > + return false; > + > + return true; > +} > + > +static void kryo_check_l1_l2_ecc(void *info) > +{ > + struct edac_device_ctl_info *edev_ctl = info; > + u64 errxstatus; > + u64 errxmisc; > + int cpu; > + > + cpu = smp_processor_id(); > + /* We know record 0 is L1 and L2 */ > + write_sysreg_s(0, SYS_ERRSELR_EL1); > + isb(); Another isb I'm not sure about. Is this meant to provide a barrier between ERRSELR and ERXSTATUS? Wouldn't that be dsb, not isb? > + > + errxstatus = read_sysreg_s(SYS_ERXSTATUS_EL1); > + if (!kryo_check_regs_valid(errxstatus)) > + return; > + > + errxmisc = read_sysreg_s(SYS_ERXMISC0_EL1); > + /* Check if L1/L2 error */ > + if (!(FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc) == KRYO_L1) && > + !(FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc) == KRYO_L2)) > + return; > + > + kryo_parse_l1_l2_cache_error(errxstatus, errxmisc, edev_ctl, cpu); > + kryo_clear_error(errxstatus); > +} > +