Re: [PATCH v0 3/4] drivers: edac: Add cache erp driver for Last Level Cache Controller (LLCC)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Jul 25, 2018 at 10:44:56AM -0700, Venkata Narendra Kumar Gutta wrote:
> Add cache error reporting driver for single and double bit errors on
> Last Level Cache Controller (LLCC) cache. This driver takes care of
> dumping registers and add config options to enable and disable panic
> when these errors happen.
> 
> Signed-off-by: Channagoud Kadabi <ckadabi@xxxxxxxxxxxxxx>
> Signed-off-by: Venkata Narendra Kumar Gutta <vnkgutta@xxxxxxxxxxxxxx>

This SOB chain doesn't make any sense - see
Documentation/process/submitting-patches.rst

> ---
>  drivers/edac/Kconfig          |  21 ++
>  drivers/edac/Makefile         |   1 +
>  drivers/edac/qcom_llcc_edac.c | 520 ++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 542 insertions(+)
>  create mode 100644 drivers/edac/qcom_llcc_edac.c

Needs MAINTAINERS entry so that you get all the bug reports.

> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index 57304b2..68518ad 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -460,4 +460,25 @@ config EDAC_TI
>  	  Support for error detection and correction on the
>            TI SoCs.
>  
> +config EDAC_QCOM_LLCC
> +        depends on QCOM_LLCC
> +        tristate "QCOM EDAC Controller for LLCC Cache"

Please don't add one EDAC driver per functional unit - see how
altera_edac.c does it, for example. IOW, if this driver cannot
share/reuse any of the existing EDAC drivers, it should be called
qcom_edac and contain all the Qualcomm-specific RAS features.

> +        help
> +          Support for error detection and correction on the
> +          QCOM LLCC cache. Report errors caught by LLCC ECC
> +          mechanism.
> +
> +          For debugging issues having to do with stability and overall system
> +          health, you should probably say 'Y' here.
> +
> +config EDAC_QCOM_LLCC_PANIC_ON_UE
> +        depends on EDAC_QCOM_LLCC
> +        bool "Panic on uncorrectable errors - qcom llcc"
> +        help
> +          Forcibly cause a kernel panic if an uncorrectable error (UE) is
> +          detected. This can reduce debugging times on hardware which may be
> +          operating at voltages or frequencies outside normal specification.
> +
> +          For production builds, you should probably say 'N' here.
> +
>  endif # EDAC
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index 02b43a7..28aff28 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -77,3 +77,4 @@ obj-$(CONFIG_EDAC_ALTERA)		+= altera_edac.o
>  obj-$(CONFIG_EDAC_SYNOPSYS)		+= synopsys_edac.o
>  obj-$(CONFIG_EDAC_XGENE)		+= xgene_edac.o
>  obj-$(CONFIG_EDAC_TI)			+= ti_edac.o
> +obj-$(CONFIG_EDAC_QCOM_LLCC)		+= qcom_llcc_edac.o
> diff --git a/drivers/edac/qcom_llcc_edac.c b/drivers/edac/qcom_llcc_edac.c
> new file mode 100644
> index 0000000..7a678b5
> --- /dev/null
> +++ b/drivers/edac/qcom_llcc_edac.c
> @@ -0,0 +1,520 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2018, The Linux Foundation. All rights reserved.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/edac.h>
> +#include <linux/of_device.h>
> +#include <linux/platform_device.h>
> +#include <linux/smp.h>
> +#include <linux/spinlock.h>
> +#include <linux/regmap.h>
> +#include <linux/interrupt.h>
> +#include <linux/soc/qcom/llcc-qcom.h>
> +#include "edac_mc.h"
> +#include "edac_device.h"
> +
> +#ifdef CONFIG_EDAC_QCOM_LLCC_PANIC_ON_UE
> +#define LLCC_ERP_PANIC_ON_UE 1
> +#else
> +#define LLCC_ERP_PANIC_ON_UE 0
> +#endif
> +
> +#define EDAC_LLCC	"qcom_llcc"
> +
> +#define TRP_SYN_REG_CNT	6
> +
> +#define DRP_SYN_REG_CNT	8
> +
> +#define LLCC_COMMON_STATUS0		0x0003000C
> +#define LLCC_LB_CNT_MASK		GENMASK(31, 28)
> +#define LLCC_LB_CNT_SHIFT		28
> +
> +/* single & Double Bit syndrome register offsets */
> +#define TRP_ECC_SB_ERR_SYN0		0x0002304C
> +#define TRP_ECC_DB_ERR_SYN0		0x00020370
> +#define DRP_ECC_SB_ERR_SYN0		0x0004204C
> +#define DRP_ECC_DB_ERR_SYN0		0x00042070
> +
> +/* Error register offsets */
> +#define TRP_ECC_ERROR_STATUS1		0x00020348
> +#define TRP_ECC_ERROR_STATUS0		0x00020344
> +#define DRP_ECC_ERROR_STATUS1		0x00042048
> +#define DRP_ECC_ERROR_STATUS0		0x00042044
> +
> +/* TRP, DRP interrupt register offsets */
> +#define DRP_INTERRUPT_STATUS		0x00041000
> +#define TRP_INTERRUPT_0_STATUS		0x00020480
> +#define DRP_INTERRUPT_CLEAR		0x00041008
> +#define DRP_ECC_ERROR_CNTR_CLEAR	0x00040004
> +#define TRP_INTERRUPT_0_CLEAR		0x00020484
> +#define TRP_ECC_ERROR_CNTR_CLEAR	0x00020440
> +
> +/* Mask and shift macros */
> +#define ECC_DB_ERR_COUNT_MASK	GENMASK(4, 0)

Align all those to the same vertical column.

> +#define ECC_DB_ERR_WAYS_MASK	GENMASK(31, 16)
> +#define ECC_DB_ERR_WAYS_SHIFT	BIT(4)
> +
> +#define ECC_SB_ERR_COUNT_MASK	GENMASK(23, 16)
> +#define ECC_SB_ERR_COUNT_SHIFT	BIT(4)
> +#define ECC_SB_ERR_WAYS_MASK	GENMASK(15, 0)
> +
> +#define SB_ECC_ERROR		BIT(0)
> +#define DB_ECC_ERROR		BIT(1)
> +
> +#define DRP_TRP_INT_CLEAR	GENMASK(1, 0)
> +#define DRP_TRP_CNT_CLEAR	GENMASK(1, 0)
> +
> +/* Config registers offsets*/
> +#define DRP_ECC_ERROR_CFG       0x00040000
> +
> +/* TRP, DRP interrupt register offsets */
> +#define CMN_INTERRUPT_0_ENABLE          0x0003001C
> +#define CMN_INTERRUPT_2_ENABLE          0x0003003C
> +#define TRP_INTERRUPT_0_ENABLE          0x00020488
> +#define DRP_INTERRUPT_ENABLE            0x0004100C
> +
> +#define SB_ERROR_THRESHOLD      0x1
> +#define SB_ERROR_THRESHOLD_SHIFT        24
> +#define SB_DB_TRP_INTERRUPT_ENABLE      0x3
> +#define TRP0_INTERRUPT_ENABLE   0x1
> +#define DRP0_INTERRUPT_ENABLE   BIT(6)
> +#define SB_DB_DRP_INTERRUPT_ENABLE      0x3
> +
> +
> +enum {
> +	LLCC_DRAM_CE = 0,
> +	LLCC_DRAM_UE,
> +	LLCC_TRAM_CE,
> +	LLCC_TRAM_UE,
> +};
> +
> +struct errors_edac {
> +	const char *msg;
> +	void (*func)(struct edac_device_ctl_info *edev_ctl,
> +				int inst_nr, int block_nr, const char *msg);
> +};
> +
> +static const struct errors_edac errors[] = {
> +	{"LLCC Data RAM correctable Error", edac_device_handle_ce},
> +	{"LLCC Data RAM uncorrectable Error", edac_device_handle_ue},
> +	{"LLCC Tag RAM correctable Error", edac_device_handle_ce},
> +	{"LLCC Tag RAM uncorrectable Error", edac_device_handle_ue},
> +};
> +
> +static int qcom_llcc_core_setup(struct regmap *llcc_bcast_regmap)
> +{
> +	u32 sb_err_threshold;
> +	int ret;
> +
> +	/* Enable TRP in instance 2 of common interrupt enable register */
> +	ret = regmap_update_bits(llcc_bcast_regmap, CMN_INTERRUPT_2_ENABLE,
> +				TRP0_INTERRUPT_ENABLE,
> +				TRP0_INTERRUPT_ENABLE);

Align arguments at the opening parenthesis. Check the rest below too.

> +	if (ret)
> +		return ret;
> +
> +	/* Enable ECC interrupts on Tag Ram */
> +	ret = regmap_update_bits(llcc_bcast_regmap, TRP_INTERRUPT_0_ENABLE,
> +				SB_DB_TRP_INTERRUPT_ENABLE,
> +				SB_DB_TRP_INTERRUPT_ENABLE);
> +	if (ret)
> +		return ret;
> +
> +	/* Enable SB error for Data RAM */
> +	sb_err_threshold = (SB_ERROR_THRESHOLD << SB_ERROR_THRESHOLD_SHIFT);
> +	ret = regmap_write(llcc_bcast_regmap, DRP_ECC_ERROR_CFG,
> +				sb_err_threshold);
> +	if (ret)
> +		return ret;
> +
> +	/* Enable DRP in instance 2 of common interrupt enable register */
> +	ret = regmap_update_bits(llcc_bcast_regmap, CMN_INTERRUPT_2_ENABLE,
> +				DRP0_INTERRUPT_ENABLE, DRP0_INTERRUPT_ENABLE);
> +	if (ret)
> +		return ret;
> +
> +	/* Enable ECC interrupts on Data Ram */
> +	ret = regmap_write(llcc_bcast_regmap, DRP_INTERRUPT_ENABLE,
> +				SB_DB_DRP_INTERRUPT_ENABLE);
> +	return ret;
> +}
> +
> +/* Clear the error interrupt and counter registers */
> +static int qcom_llcc_clear_errors(int err_type, struct llcc_drv_data *drv)
> +{
> +	int ret = 0;
> +
> +	switch (err_type) {
> +	case LLCC_DRAM_CE:
> +	case LLCC_DRAM_UE:
> +		/* Clear the interrupt */
> +		ret = regmap_write(drv->bcast_regmap, DRP_INTERRUPT_CLEAR,
> +					DRP_TRP_INT_CLEAR);
> +		if (ret)
> +			return ret;
> +
> +		/* Clear the counters */
> +		ret = regmap_write(drv->bcast_regmap, DRP_ECC_ERROR_CNTR_CLEAR,
> +					DRP_TRP_CNT_CLEAR);
> +		if (ret)
> +			return ret;
> +		break;
> +	case LLCC_TRAM_CE:
> +	case LLCC_TRAM_UE:
> +		ret = regmap_write(drv->bcast_regmap, TRP_INTERRUPT_0_CLEAR,
> +					DRP_TRP_INT_CLEAR);
> +		if (ret)
> +			return ret;
> +
> +		ret = regmap_write(drv->bcast_regmap, TRP_ECC_ERROR_CNTR_CLEAR,
> +					DRP_TRP_CNT_CLEAR);
> +		if (ret)
> +			return ret;
> +		break;
> +	}
> +	return ret;
> +}
> +
> +/* Dump syndrome registers for tag Ram Double bit errors */
> +static int dump_trp_db_syn_reg(struct llcc_drv_data *drv, u32 bank)
> +{
> +	int i, ret;
> +	int db_err_cnt;
> +	int db_err_ways;
> +	u32 synd_reg;
> +	u32 synd_val;
> +
> +	for (i = 0; i < TRP_SYN_REG_CNT; i++) {
> +		synd_reg = TRP_ECC_DB_ERR_SYN0 + (i * 4);
> +		ret = regmap_read(drv->regmap, drv->offsets[bank] + synd_reg,
> +				&synd_val);
> +		if (ret)
> +			return ret;
> +		edac_printk(KERN_CRIT, EDAC_LLCC, "TRP_ECC_SYN%d: 0x%8x\n",
> +			i, synd_val);
> +	}
> +
> +	ret = regmap_read(drv->regmap,
> +				drv->offsets[bank] + TRP_ECC_ERROR_STATUS1,
> +				&db_err_cnt);
> +	if (ret)
> +		return ret;
> +	db_err_cnt = (db_err_cnt & ECC_DB_ERR_COUNT_MASK);
> +	edac_printk(KERN_CRIT, EDAC_LLCC, "Double-Bit error count: 0x%4x\n",
> +		db_err_cnt);
> +
> +	ret = regmap_read(drv->regmap,
> +		drv->offsets[bank] + TRP_ECC_ERROR_STATUS0, &db_err_ways);
> +	if (ret)
> +		return ret;
> +	db_err_ways = (db_err_ways & ECC_DB_ERR_WAYS_MASK);
> +	db_err_ways >>= ECC_DB_ERR_WAYS_SHIFT;
> +
> +	edac_printk(KERN_CRIT, EDAC_LLCC, "Double-Bit error ways: 0x%4x\n",
> +		db_err_ways);
> +
> +	return ret;
> +}
> +
> +/* Dump syndrome register for tag Ram Single Bit Errors */
> +static int dump_trp_sb_syn_reg(struct llcc_drv_data *drv, u32 bank)
> +{
> +	int i, ret;
> +	int sb_err_cnt;
> +	int sb_err_ways;
> +	u32 synd_reg;
> +	u32 synd_val;
> +
> +	for (i = 0; i < TRP_SYN_REG_CNT; i++) {
> +		synd_reg = TRP_ECC_SB_ERR_SYN0 + (i * 4);
> +		ret = regmap_read(drv->regmap, drv->offsets[bank] + synd_reg,
> +					&synd_val);
> +		if (ret)
> +			return ret;
> +		edac_printk(KERN_CRIT, EDAC_LLCC, "TRP_ECC_SYN%d: 0x%8x\n",
> +				i, synd_val);
> +	}
> +
> +	ret = regmap_read(drv->regmap,
> +				drv->offsets[bank] + TRP_ECC_ERROR_STATUS1,
> +				&sb_err_cnt);
> +	if (ret)
> +		return ret;
> +	sb_err_cnt = (sb_err_cnt & ECC_SB_ERR_COUNT_MASK);
> +	sb_err_cnt >>= ECC_SB_ERR_COUNT_SHIFT;
> +	edac_printk(KERN_CRIT, EDAC_LLCC, "Single-Bit error count: 0x%4x\n",
> +		sb_err_cnt);
> +
> +	ret = regmap_read(drv->regmap,
> +				drv->offsets[bank] + TRP_ECC_ERROR_STATUS0,
> +				&sb_err_ways);
> +	if (ret)
> +		return ret;
> +
> +	sb_err_ways = sb_err_ways & ECC_SB_ERR_WAYS_MASK;
> +
> +	edac_printk(KERN_CRIT, EDAC_LLCC, "Single-Bit error ways: 0x%4x\n",
> +			sb_err_ways);
> +
> +	return ret;
> +}
> +
> +/* Dump syndrome registers for Data Ram Double bit errors */
> +static int dump_drp_db_syn_reg(struct llcc_drv_data *drv, u32 bank)
> +{
> +	int i, ret;
> +	int db_err_cnt;
> +	int db_err_ways;
> +	u32 synd_reg;
> +	u32 synd_val;
> +
> +	for (i = 0; i < DRP_SYN_REG_CNT; i++) {
> +		synd_reg = DRP_ECC_DB_ERR_SYN0 + (i * 4);
> +		ret = regmap_read(drv->regmap, drv->offsets[bank] + synd_reg,
> +					&synd_val);
> +		if (ret)
> +			return ret;
> +		edac_printk(KERN_CRIT, EDAC_LLCC, "DRP_ECC_SYN%d: 0x%8x\n",
> +				i, synd_val);
> +	}
> +
> +	ret = regmap_read(drv->regmap,
> +				drv->offsets[bank] + DRP_ECC_ERROR_STATUS1,
> +				&db_err_cnt);
> +	if (ret)
> +		return ret;
> +	db_err_cnt = (db_err_cnt & ECC_DB_ERR_COUNT_MASK);
> +	edac_printk(KERN_CRIT, EDAC_LLCC, "Double-Bit error count: 0x%4x\n",
> +		db_err_cnt);
> +
> +	ret = regmap_read(drv->regmap,
> +				drv->offsets[bank] + DRP_ECC_ERROR_STATUS0,
> +				&db_err_ways);
> +	if (ret)
> +		return ret;
> +	db_err_ways &= ECC_DB_ERR_WAYS_MASK;
> +	db_err_ways >>= ECC_DB_ERR_WAYS_SHIFT;
> +	edac_printk(KERN_CRIT, EDAC_LLCC, "Double-Bit error ways: 0x%4x\n",
> +		db_err_ways);
> +
> +	return ret;
> +}
> +
> +/* Dump Syndrome registers for Data Ram Single bit errors*/
> +static int dump_drp_sb_syn_reg(struct llcc_drv_data *drv, u32 bank)
> +{
> +	int i, ret;
> +	int sb_err_cnt;
> +	int sb_err_ways;
> +	u32 synd_reg;
> +	u32 synd_val;
> +
> +	for (i = 0; i < DRP_SYN_REG_CNT; i++) {
> +		synd_reg = DRP_ECC_SB_ERR_SYN0 + (i * 4);
> +		ret = regmap_read(drv->regmap, drv->offsets[bank] + synd_reg,
> +					&synd_val);
> +		if (ret)
> +			return ret;
> +		edac_printk(KERN_CRIT, EDAC_LLCC, "DRP_ECC_SYN%d: 0x%8x\n",
> +				i, synd_val);
> +	}
> +
> +	ret = regmap_read(drv->regmap,
> +				drv->offsets[bank] + DRP_ECC_ERROR_STATUS1,
> +				&sb_err_cnt);
> +	if (ret)
> +		return ret;
> +	sb_err_cnt &= ECC_SB_ERR_COUNT_MASK;
> +	sb_err_cnt >>= ECC_SB_ERR_COUNT_SHIFT;
> +	edac_printk(KERN_CRIT, EDAC_LLCC, "Single-Bit error count: 0x%4x\n",
> +		sb_err_cnt);
> +
> +	ret = regmap_read(drv->regmap,
> +				drv->offsets[bank] + DRP_ECC_ERROR_STATUS0,
> +				&sb_err_ways);
> +	if (ret)
> +		return ret;
> +	sb_err_ways = sb_err_ways & ECC_SB_ERR_WAYS_MASK;
> +
> +	edac_printk(KERN_CRIT, EDAC_LLCC, "Single-Bit error ways: 0x%4x\n",
> +		sb_err_ways);
> +
> +	return ret;
> +}
> +
> +

One blank line is enough.

> +static int dump_syn_reg(struct edac_device_ctl_info *edev_ctl,
> +			 int err_type, u32 bank)
> +{
> +	int ret = 0;
> +	struct llcc_drv_data *drv = edev_ctl->pvt_info;
> +
> +	switch (err_type) {
> +	case LLCC_DRAM_CE:
> +		ret = dump_drp_sb_syn_reg(drv, bank);
> +		break;
> +	case LLCC_DRAM_UE:
> +		ret = dump_drp_db_syn_reg(drv, bank);
> +		break;
> +	case LLCC_TRAM_CE:
> +		ret = dump_trp_sb_syn_reg(drv, bank);
> +		break;
> +	case LLCC_TRAM_UE:
> +		ret = dump_trp_db_syn_reg(drv, bank);
> +		break;
> +	}
> +	if (ret)
> +		return ret;
> +
> +	ret = qcom_llcc_clear_errors(err_type, drv);
> +	if (ret)
> +		return ret;
> +
> +	errors[err_type].func(edev_ctl, 0, bank, errors[err_type].msg);
> +
> +	return ret;
> +}
> +
> +static irqreturn_t qcom_llcc_check_cache_errors
> +		(struct edac_device_ctl_info *edev_ctl)

Please don't split the function name from the args.

static irqreturn_t
qcom_llcc_check_cache_errors(struct edac_device_ctl_info *edev_ctl)

is a bit better, for example.

> +{
> +	int ret;
> +	u32 drp_error;
> +	u32 trp_error;
> +	struct llcc_drv_data *drv = edev_ctl->pvt_info;
> +	u32 i;
> +	irqreturn_t irq_rc = IRQ_NONE;
> +
> +	for (i = 0; i < drv->num_banks; i++) {
> +		/* Look for Data RAM errors */
> +		ret = regmap_read(drv->regmap,
> +				drv->offsets[i] + DRP_INTERRUPT_STATUS,
> +				&drp_error);
> +		if (ret)
> +			return irq_rc;
> +
> +		if (drp_error & SB_ECC_ERROR) {
> +			edac_printk(KERN_CRIT, EDAC_LLCC,
> +				"Single Bit Error detected in Data Ram\n");
> +			dump_syn_reg(edev_ctl, LLCC_DRAM_CE, i);
> +			irq_rc = IRQ_HANDLED;
> +		} else if (drp_error & DB_ECC_ERROR) {
> +			edac_printk(KERN_CRIT, EDAC_LLCC,
> +				"Double Bit Error detected in Data Ram\n");
> +			dump_syn_reg(edev_ctl, LLCC_DRAM_UE, i);
> +			irq_rc = IRQ_HANDLED;
> +		}
> +
> +		/* Look for Tag RAM errors */
> +		ret = regmap_read(drv->regmap,
> +				drv->offsets[i] + TRP_INTERRUPT_0_STATUS,
> +				&trp_error);
> +		if (ret)
> +			return irq_rc;
> +		if (trp_error & SB_ECC_ERROR) {
> +			edac_printk(KERN_CRIT, EDAC_LLCC,
> +				"Single Bit Error detected in Tag Ram\n");
> +			dump_syn_reg(edev_ctl, LLCC_TRAM_CE, i);
> +			irq_rc = IRQ_HANDLED;
> +		} else if (trp_error & DB_ECC_ERROR) {
> +			edac_printk(KERN_CRIT, EDAC_LLCC,
> +				"Double Bit Error detected in Tag Ram\n");
> +			dump_syn_reg(edev_ctl, LLCC_TRAM_UE, i);
> +			irq_rc = IRQ_HANDLED;
> +		}
> +	}
> +
> +	return irq_rc;
> +}
> +
> +static irqreturn_t llcc_ecc_irq_handler
> +			(int irq, void *edev_ctl)

That looks like a useless wrapper, get rid of it.

> +{
> +	return qcom_llcc_check_cache_errors(edev_ctl);
> +}
> +
> +static int qcom_llcc_erp_probe(struct platform_device *pdev)
> +{
> +	int rc;
> +	u32 ecc_irq;
> +	struct edac_device_ctl_info *edev_ctl;
> +	struct device *dev = &pdev->dev;
> +	struct llcc_drv_data *llcc_driv_data = pdev->dev.platform_data;

Please sort function-local variable declarations in reverse Christmas
tree order:

	<type> longest_variable_name;
	<type> shorter_var_name;
	<type> even_shorter;
	<type> i;

Ditto for the other functions.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-soc" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux