Re: [PATCH 2/2] drivers: edac: Add EDAC support for Kryo CPU caches

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Sai,

On Thu, Dec 5, 2019 at 1:53 AM Sai Prakash Ranjan
<saiprakash.ranjan@xxxxxxxxxxxxxx> wrote:
>
> Kryo{3,4}XX CPU cores implement RAS extensions to support
> Error Correcting Code(ECC). Currently all Kryo{3,4}XX CPU
> cores (gold/silver a.k.a big/LITTLE) support ECC via RAS.
> This adds an interrupt based driver for those CPUs and
> provides an optional polling of error recording system
> registers.
>
> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx>
> ---
>  MAINTAINERS                   |   7 +
>  drivers/edac/Kconfig          |  20 +
>  drivers/edac/Makefile         |   1 +
>  drivers/edac/qcom_kryo_edac.c | 679 ++++++++++++++++++++++++++++++++++
>  4 files changed, 707 insertions(+)
>  create mode 100644 drivers/edac/qcom_kryo_edac.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index c2d80079dccc..f58c93f963f6 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -6049,6 +6049,13 @@ L:       linux-edac@xxxxxxxxxxxxxxx
>  S:     Maintained
>  F:     drivers/edac/qcom_edac.c
>
> +EDAC-KRYO-QCOM
> +M:     Sai Prakash Ranjan <saiprakash.ranjan@xxxxxxxxxxxxxx>
> +L:     linux-arm-msm@xxxxxxxxxxxxxxx
> +L:     linux-edac@xxxxxxxxxxxxxxx
> +S:     Maintained
> +F:     drivers/edac/qcom_kryo_edac.c
> +
>  EDIROL UA-101/UA-1000 DRIVER
>  M:     Clemens Ladisch <clemens@xxxxxxxxxx>
>  L:     alsa-devel@xxxxxxxxxxxxxxxx (moderated for non-subscribers)
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index 417dad635526..cd78ac2917c9 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -508,6 +508,26 @@ config EDAC_QCOM
>           For debugging issues having to do with stability and overall system
>           health, you should probably say 'Y' here.
>
> +config EDAC_QCOM_KRYO
> +       tristate "QCOM Kryo EDAC for CPU L1/L2/L3-SCU caches"
> +       depends on ARCH_QCOM && ARM64_RAS_EXTN
> +       help
> +         Support for Error detection and correction on Kryo Gold and Silver CPU
> +         cores with RAS extensions. Currently it detects and reports all Single
> +         Bit Errors (SBEs) and Double Bit Errors (DBEs).
> +
> +         For debugging issues having to do with stability and overall system
> +         health, you should probably say 'Y' here.
> +
> +config EDAC_QCOM_KRYO_POLL
> +       depends on EDAC_QCOM_KRYO
> +       bool "Poll on Kryo ECC registers"
> +       help
> +         This option chooses whether or not you want to poll on the Kryo ECC
> +         registers. When this is enabled, the polling rate can be set as a
> +         module parameter. By default, it will call the polling function every
> +         second.
> +
>  config EDAC_ASPEED
>         tristate "Aspeed AST 2500 SoC"
>         depends on MACH_ASPEED_G5
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index d77200c9680b..29edcfa6ec0e 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -85,5 +85,6 @@ obj-$(CONFIG_EDAC_SYNOPSYS)           += synopsys_edac.o
>  obj-$(CONFIG_EDAC_XGENE)               += xgene_edac.o
>  obj-$(CONFIG_EDAC_TI)                  += ti_edac.o
>  obj-$(CONFIG_EDAC_QCOM)                        += qcom_edac.o
> +obj-$(CONFIG_EDAC_QCOM_KRYO)           += qcom_kryo_edac.o
>  obj-$(CONFIG_EDAC_ASPEED)              += aspeed_edac.o
>  obj-$(CONFIG_EDAC_BLUEFIELD)           += bluefield_edac.o
> diff --git a/drivers/edac/qcom_kryo_edac.c b/drivers/edac/qcom_kryo_edac.c
> new file mode 100644
> index 000000000000..05b60ad3cb0e
> --- /dev/null
> +++ b/drivers/edac/qcom_kryo_edac.c
> @@ -0,0 +1,679 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2019, The Linux Foundation. All rights reserved.
> + */
> +
> +#include <linux/bitfield.h>
> +#include <linux/cpu_pm.h>
> +#include <linux/of_irq.h>
> +#include <linux/smp.h>
> +
> +#include <asm/cputype.h>
> +#include <asm/sysreg.h>
> +
> +#include "edac_device.h"
> +#include "edac_mc.h"
> +
> +#define DRV_NAME               "qcom_kryo_edac"
> +
> +/*
> + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.3
> + * ARM DSU TRM Chapter B2.3
> + * CFI = Corrected Fault Handling interrupt, FI = Fault handling interrupt
> + * UI = Uncorrected error recovery interrupt, ED = Error Detection
> + */
> +#define KRYO_ERRXCTLR_ED       BIT(0)
> +#define KRYO_ERRXCTLR_UI       BIT(2)
> +#define KRYO_ERRXCTLR_FI       BIT(3)
> +#define KRYO_ERRXCTLR_CFI      BIT(8)
> +#define KRYO_ERRXCTLR_ENABLE   (KRYO_ERRXCTLR_CFI | KRYO_ERRXCTLR_FI | \
> +                                KRYO_ERRXCTLR_UI | KRYO_ERRXCTLR_ED)
> +
> +/*
> + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.4
> + * ARM DSU TRM Chapter B2.4
> + */
> +#define KRYO_ERRXFR_ED         GENMASK(1, 0)
> +#define KRYO_ERRXFR_DE         GENMASK(3, 2)
> +#define KRYO_ERRXFR_UI         GENMASK(5, 4)
> +#define KRYO_ERRXFR_FI         GENMASK(7, 6)
> +#define KRYO_ERRXFR_UE         GENMASK(9, 8)
> +#define KRYO_ERRXFR_CFI                GENMASK(11, 10)
> +#define KRYO_ERRXFR_CEC                GENMASK(14, 12)
> +#define KRYO_ERRXFR_RP         BIT(15)
> +#define KRYO_ERRXFR_SUPPORTED  (KRYO_ERRXFR_ED | KRYO_ERRXFR_DE | \
> +                                KRYO_ERRXFR_UI | KRYO_ERRXFR_FI | \
> +                                KRYO_ERRXFR_UE | KRYO_ERRXFR_CFI | \
> +                                KRYO_ERRXFR_CEC | KRYO_ERRXFR_RP)
> +
> +/*
> + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.5
> + * ARM DSU TRM Chapter B2.5
> + */
> +#define KRYO_ERRXMISC0_CECR    GENMASK_ULL(38, 32)
> +#define KRYO_ERRXMISC0_CECO    GENMASK_ULL(46, 40)
> +
> +/* ARM Cortex-A76 TRM Chapter B3.5 */
> +#define KRYO_ERRXMISC0_UNIT    GENMASK(3, 0)
> +#define KRYO_ERRXMISC0_LVL     GENMASK(3, 1)
> +
> +/* ARM Cortex-A76 TRM Chapter B3.10 has SERR bitfields 4:0
> + * but Cortex-A55, Cortex-A75 and DSU TRM has SERR bitfields 7:0.
> + * Since max error record is 21, we can use bitfields 4:0 for
> + * Kryo{3,4}XX CPUs.
> + */
> +#define KRYO_ERRXSTATUS_SERR   GENMASK(4, 0)
> +#define KRYO_ERRXSTATUS_DE     BIT(23)
> +#define KRYO_ERRXSTATUS_CE     GENMASK(25, 24)
> +#define KRYO_ERRXSTATUS_MV     BIT(26)
> +#define KRYO_ERRXSTATUS_UE     BIT(29)
> +#define KRYO_ERRXSTATUS_VALID  BIT(30)
> +
> +/* ARM Cortex-A76 TRM Chapter B3.5
> + * IC = Instruction Cache, DC = Data Cache
> + */
> +#define KRYO_L1_UNIT_IC                0x1
> +#define KRYO_L2_UNIT_TLB       0x2
> +#define KRYO_L1_UNIT_DC                0x4
> +#define KRYO_L2_UNIT           0x8
> +
> +/*
> + * ARM Cortex-A55 TRM Chapter B2.36
> + * ARM Cortex-A75, Cortex-A76 TRM Chapter B2.37
> + */
> +#define KRYO_ERR_RECORD_L1_L2  0x0
> +#define KRYO_ERR_RECORD_L3     0x1
> +
> +/* ARM DSU TRM Chapter B2.10 */
> +#define BUS_ERROR              0x12
> +
> +/* QCOM Kryo CPU part numbers */
> +#define KRYO3XX_GOLD           0x802
> +#define KRYO4XX_GOLD           0x804
> +#define KRYO4XX_SILVER_V1      0x803
> +#define KRYO4XX_SILVER_V2      0x805
> +
> +#define KRYO_EDAC_MSG_MAX      256
> +
> +static int poll_msec = 1000;
> +module_param(poll_msec, int, 0444);
> +
> +enum {
> +       KRYO_L1 = 0,
> +       KRYO_L2,
> +       KRYO_L3,
> +};
> +
> +/* CE = Corrected Error, UE = Uncorrected Error, DE = Deferred Error */
> +enum {

No name?

> +       KRYO_L1_CE = 0,
> +       KRYO_L1_UE,
> +       KRYO_L1_DE,
> +       KRYO_L2_CE,
> +       KRYO_L2_UE,
> +       KRYO_L2_DE,
> +       KRYO_L3_CE,
> +       KRYO_L3_UE,
> +       KRYO_L3_DE,
> +};
> +
> +struct error_record {
> +       u32 error_code;
> +       const char *error_msg;
> +};
> +
> +struct error_type {
> +       void (*fn)(struct edac_device_ctl_info *edev_ctl,
> +                  int inst_nr, int block_nr, const char *msg);
> +       const char *msg;
> +};
> +
> +/*
> + * ARM Cortex-A55, Cortex-A75, Cortex-A76 TRM Chapter B3.10
> + * ARM DSU TRM Chapter B2.10
> + */
> +static const struct error_record serror_record[] = {
> +       { 0x1,  "Errors due to fault injection"         },
> +       { 0x2,  "ECC error from internal data buffer"   },
> +       { 0x6,  "ECC error on cache data RAM"           },
> +       { 0x7,  "ECC error on cache tag or dirty RAM"   },
> +       { 0x8,  "Parity error on TLB data RAM"          },
> +       { 0x9,  "Parity error on TLB tag RAM"           },
> +       { 0x12, "Error response for a cache copyback"   },
> +       { 0x15, "Deferred error not supported"          },
> +};
> +
> +static const struct error_type err_type[] = {
> +       { edac_device_handle_ce, "Kryo L1 Corrected Error"      },
> +       { edac_device_handle_ue, "Kryo L1 Uncorrected Error"    },
> +       { edac_device_handle_ue, "Kryo L1 Deferred Error"       },
> +       { edac_device_handle_ce, "Kryo L2 Corrected Error"      },
> +       { edac_device_handle_ue, "Kryo L2 Uncorrected Error"    },
> +       { edac_device_handle_ue, "Kryo L2 Deferred Error"       },
> +       { edac_device_handle_ce, "L3 Corrected Error"           },
> +       { edac_device_handle_ue, "L3 Uncorrected Error"         },
> +       { edac_device_handle_ue, "L3 Deferred Error"            },
> +};

A comment is warranted to indicate that err_type is indexed by the
enum, as this would be easy to mess up in later changes.

> +
> +static struct edac_device_ctl_info __percpu *edac_dev;
> +static struct edac_device_ctl_info *drv_edev_ctl;
> +
> +static const char *get_error_msg(u64 errxstatus)
> +{
> +       const struct error_record *rec;
> +       u32 errxstatus_serr;
> +
> +       errxstatus_serr = FIELD_GET(KRYO_ERRXSTATUS_SERR, errxstatus);
> +
> +       for (rec = serror_record; rec->error_code; rec++) {

It looks like you expect the table to be zero terminated, but it's
not. Add the missing zero entry.

> +               if (errxstatus_serr == rec->error_code)
> +                       return rec->error_msg;
> +       }
> +
> +       return NULL;
> +}
> +
> +static void dump_syndrome_reg(int error_type, int level,
> +                             u64 errxstatus, u64 errxmisc,
> +                             struct edac_device_ctl_info *edev_ctl)
> +{
> +       char msg[KRYO_EDAC_MSG_MAX];
> +       const char *error_msg;
> +       int cpu;
> +
> +       cpu = raw_smp_processor_id();
> +
> +       error_msg = get_error_msg(errxstatus);
> +       if (!error_msg)
> +               return;
> +
> +       snprintf(msg, KRYO_EDAC_MSG_MAX,
> +                "CPU%d: %s, ERRXSTATUS_EL1:%#llx ERRXMISC0_EL1:%#llx, %s",
> +                cpu, err_type[error_type].msg, errxstatus, errxmisc,
> +                error_msg);
> +
> +       err_type[error_type].fn(edev_ctl, 0, level, msg);
> +}
> +
> +static void kryo_check_err_type(u64 errxstatus, u64 errxmisc,
> +                               struct edac_device_ctl_info *edev_ctl,
> +                               int level)
> +{
> +       u32 errxstatus_ue, errxstatus_ce, errxstatus_de;
> +
> +       errxstatus_ce = FIELD_GET(KRYO_ERRXSTATUS_CE, errxstatus);
> +       errxstatus_ue = FIELD_GET(KRYO_ERRXSTATUS_UE, errxstatus);
> +       errxstatus_de = FIELD_GET(KRYO_ERRXSTATUS_DE, errxstatus);
> +
> +       switch (level) {
> +       case KRYO_L1:
> +               if (errxstatus_ce)
> +                       dump_syndrome_reg(KRYO_L1_CE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else if (errxstatus_ue)
> +                       dump_syndrome_reg(KRYO_L1_UE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else if (errxstatus_de)
> +                       dump_syndrome_reg(KRYO_L1_DE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else
> +                       edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n");
> +               break;
> +       case KRYO_L2:
> +               if (errxstatus_ce)
> +                       dump_syndrome_reg(KRYO_L2_CE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else if (errxstatus_ue)
> +                       dump_syndrome_reg(KRYO_L2_UE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else if (errxstatus_de)
> +                       dump_syndrome_reg(KRYO_L2_DE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else
> +                       edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n");
> +               break;
> +       case KRYO_L3:
> +               if (errxstatus_ce)
> +                       dump_syndrome_reg(KRYO_L3_CE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else if (errxstatus_ue)
> +                       dump_syndrome_reg(KRYO_L3_UE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else if (errxstatus_de)
> +                       dump_syndrome_reg(KRYO_L3_DE, level, errxstatus,
> +                                         errxmisc, edev_ctl);
> +               else
> +                       edac_printk(KERN_ERR, DRV_NAME, "Unknown error\n");
> +               break;
> +       default:
> +               edac_printk(KERN_ERR, DRV_NAME, "Unknown level\n");
> +       }
> +}
> +
> +static inline void kryo_clear_error(u64 errxstatus)
> +{
> +       write_sysreg_s(errxstatus, SYS_ERXSTATUS_EL1);
> +       isb();

Is the isb() necessary? If so, why not a dsb as well?

> +}
> +
> +static void kryo_parse_l1_l2_cache_error(u64 errxstatus, u64 errxmisc,
> +                                        struct edac_device_ctl_info *edev_ctl,
> +                                        int cpu)
> +{
> +       u32 part_num = read_cpuid_part_number();
> +
> +       switch (part_num) {
> +       /* Kryo3XX gold CPU cores do not have a UNIT bitfield */
> +       case KRYO3XX_GOLD:
> +       case KRYO4XX_SILVER_V1:
> +       case KRYO4XX_SILVER_V2:
> +               switch (FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc)) {
> +               case KRYO_L1:
> +                       kryo_check_err_type(errxstatus, errxmisc,
> +                                           edev_ctl, KRYO_L1);
> +                       break;
> +               case KRYO_L2:
> +                       kryo_check_err_type(errxstatus, errxmisc,
> +                                           edev_ctl, KRYO_L2);
> +                       break;
> +               default:
> +                       edac_printk(KERN_ERR, DRV_NAME,
> +                                   "silver cpu:%d unknown error: %lu\n", cpu,
> +                                   FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc));
> +               }
> +               break;
> +       /* Kryo4XX gold CPU cores have a UNIT bitfield to identify levels */
> +       case KRYO4XX_GOLD:
> +               switch (FIELD_GET(KRYO_ERRXMISC0_UNIT, errxmisc)) {
> +               case KRYO_L1_UNIT_DC:
> +               case KRYO_L1_UNIT_IC:
> +                       kryo_check_err_type(errxstatus, errxmisc,
> +                                           edev_ctl, KRYO_L1);
> +                       break;
> +               case KRYO_L2_UNIT:
> +               case KRYO_L2_UNIT_TLB:
> +                       kryo_check_err_type(errxstatus, errxmisc,
> +                                           edev_ctl, KRYO_L2);
> +                       break;
> +               default:
> +                       edac_printk(KERN_ERR, DRV_NAME,
> +                                   "gold cpu:%d unknown error: %lu\n", cpu,
> +                                   FIELD_GET(KRYO_ERRXMISC0_UNIT, errxmisc));
> +               }
> +               break;
> +       default:
> +               edac_printk(KERN_ERR, DRV_NAME,
> +                           "Error in matching cpu%d with part num:%u\n",
> +                           cpu, part_num);
> +       }
> +}
> +
> +static inline bool kryo_check_regs_valid(u64 errxstatus)
> +{
> +       /* Check if status and misc regs are valid */
> +       if (!(FIELD_GET(KRYO_ERRXSTATUS_VALID, errxstatus)) ||
> +           !(FIELD_GET(KRYO_ERRXSTATUS_MV, errxstatus)))
> +               return false;
> +
> +       return true;
> +}
> +
> +static void kryo_check_l1_l2_ecc(void *info)
> +{
> +       struct edac_device_ctl_info *edev_ctl = info;
> +       u64 errxstatus;
> +       u64 errxmisc;
> +       int cpu;
> +
> +       cpu = smp_processor_id();
> +       /* We know record 0 is L1 and L2 */
> +       write_sysreg_s(0, SYS_ERRSELR_EL1);
> +       isb();

Another isb I'm not sure about. Is this meant to provide a barrier
between ERRSELR and ERXSTATUS? Wouldn't that be dsb, not isb?

> +
> +       errxstatus = read_sysreg_s(SYS_ERXSTATUS_EL1);
> +       if (!kryo_check_regs_valid(errxstatus))
> +               return;
> +
> +       errxmisc = read_sysreg_s(SYS_ERXMISC0_EL1);
> +       /* Check if L1/L2 error */
> +       if (!(FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc) == KRYO_L1) &&
> +           !(FIELD_GET(KRYO_ERRXMISC0_LVL, errxmisc) == KRYO_L2))
> +               return;
> +
> +       kryo_parse_l1_l2_cache_error(errxstatus, errxmisc, edev_ctl, cpu);
> +       kryo_clear_error(errxstatus);
> +}
> +



[Index of Archives]     [Device Tree Compilter]     [Device Tree Spec]     [Linux Driver Backports]     [Video for Linux]     [Linux USB Devel]     [Linux PCI Devel]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [XFree86]     [Yosemite Backpacking]


  Powered by Linux