From: Shiju Jose <shiju.jose@xxxxxxxxxx> This patch adds reporting of ECC errors in the SAS V2 driver to userspace as non-standard trace events. rasdaemon can be used to read and log these ECC errors in userspace. Sample rasdaemon log for the SAS errors, with decoding: cpu 00:[ 70.025830] hisi_sas_v2_hw HISI0162:01: phy7, wait tx fifo need send break <idle>-0 [4204528] 0.000007: non_standard_event: 2017-09-06 11:14:49 +0000 Recoverable section type: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 fru text: HISI0162:01 fru id: 00000000-0000-0000-0000-000000000000 length: 24 error: 00000000: 00000007 00000000 0000013c 00000000 00000010: 00000000 00000001 HISI HIP07: SAS error: [phy addr = 0x0x13c: single-bit ecc: error type = hgc_dqe ecc] cpu 00: <idle>-0 [4204552] 0.000007: non_standard_event: 2017-09-06 11:14:49 +0000 Fatal section type: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 fru text: HISI0162:01 fru id: 00000000-0000-0000-0000-000000000000 length: 24 error: 00000000: 00000007 00000000 0000013c 00000000 00000010: 00000001 00000001 HISI HIP07: SAS error: [phy addr = 0x0x13c: multi-bit ecc: error type = hgc_dqe ecc] Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx> Signed-off-by: John Garry <john.garry@xxxxxxxxxx> --- drivers/scsi/hisi_sas/hisi_sas.h | 9 ++++ drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 95 +++++++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index d2d384b..58bc69e 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -12,6 +12,7 @@ #ifndef _HISI_SAS_H_ #define _HISI_SAS_H_ +#include <acpi/ghes.h> #include <linux/acpi.h> #include <linux/clk.h> #include <linux/dmapool.h> @@ -22,7 +23,9 @@ #include <linux/pci.h> #include <linux/platform_device.h> #include <linux/property.h> +#include <linux/ras.h> #include <linux/regmap.h> +#include <ras/ras_event.h> #include <scsi/sas_ata.h> #include <scsi/libsas.h> @@ -96,9 +99,15 @@
struct hisi_sas_hw_error { int shift; const char *msg; int reg; + u32 type; const struct hisi_sas_hw_error *sub; }; +enum hisi_sas_bit_err_type { + HISI_SAS_ERR_SINGLE_BIT_ECC = 0x0, + HISI_SAS_ERR_MULTI_BIT_ECC = 0x1, +}; + struct hisi_sas_phy { struct hisi_hba *hisi_hba; struct hisi_sas_port *port; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index ee34f2e..0cf8244 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -379,6 +379,17 @@ #define HISI_SAS_FATAL_INT_NR 2 +#define HISI_SAS_ECC_ERR_HGC_DQE BIT(0) +#define HISI_SAS_ECC_ERR_HGC_IOST BIT(1) +#define HISI_SAS_ECC_ERR_HGC_ITCT BIT(2) +#define HISI_SAS_ECC_ERR_HGC_IOSTLIST BIT(3) +#define HISI_SAS_ECC_ERR_HGC_ITCTLIST BIT(4) +#define HISI_SAS_ECC_ERR_HGC_CQE BIT(5) +#define HISI_SAS_ECC_ERR_HGC_RXM_MEM0 BIT(6) +#define HISI_SAS_ECC_ERR_HGC_RXM_MEM1 BIT(7) +#define HISI_SAS_ECC_ERR_HGC_RXM_MEM2 BIT(8) +#define HISI_SAS_ECC_ERR_HGC_RXM_MEM3 BIT(9) + struct hisi_sas_complete_v2_hdr { __le32 dw0; __le32 dw1; @@ -401,6 +412,13 @@ struct hisi_sas_err_record_v2 { __le32 dma_rx_err_type; }; +struct hisi_sas_hw_err_info { + u64 validation_bits; + u64 physical_addr; + u32 mb_err; + u32 type; +}; + static const struct hisi_sas_hw_error one_bit_ecc_errors[] = { { .irq_msk = BIT(SAS_ECC_INTR_DQE_ECC_1B_OFF), @@ -408,6 +426,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_DQE_ECC_1B_ADDR_OFF, .msg = "hgc_dqe_acc1b_intr found: Ram address is 0x%08X\n", .reg = HGC_DQE_ECC_ADDR, + .type = HISI_SAS_ECC_ERR_HGC_DQE, }, { .irq_msk = BIT(SAS_ECC_INTR_IOST_ECC_1B_OFF), @@ -415,6 +434,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_IOST_ECC_1B_ADDR_OFF, .msg = "hgc_iost_acc1b_intr found: Ram address is 0x%08X\n", .reg = HGC_IOST_ECC_ADDR, + .type = HISI_SAS_ECC_ERR_HGC_IOST, }, { .irq_msk = BIT(SAS_ECC_INTR_ITCT_ECC_1B_OFF), @@ -422,6 +442,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_ITCT_ECC_1B_ADDR_OFF, .msg = 
"hgc_itct_acc1b_intr found: am address is 0x%08X\n", .reg = HGC_ITCT_ECC_ADDR, + .type = HISI_SAS_ECC_ERR_HGC_ITCT, }, { .irq_msk = BIT(SAS_ECC_INTR_IOSTLIST_ECC_1B_OFF), @@ -429,6 +450,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_LM_DFX_STATUS2_IOSTLIST_OFF, .msg = "hgc_iostl_acc1b_intr found: memory address is 0x%08X\n", .reg = HGC_LM_DFX_STATUS2, + .type = HISI_SAS_ECC_ERR_HGC_IOSTLIST, }, { .irq_msk = BIT(SAS_ECC_INTR_ITCTLIST_ECC_1B_OFF), @@ -436,6 +458,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_LM_DFX_STATUS2_ITCTLIST_OFF, .msg = "hgc_itctl_acc1b_intr found: memory address is 0x%08X\n", .reg = HGC_LM_DFX_STATUS2, + .type = HISI_SAS_ECC_ERR_HGC_ITCTLIST, }, { .irq_msk = BIT(SAS_ECC_INTR_CQE_ECC_1B_OFF), @@ -443,6 +466,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_CQE_ECC_1B_ADDR_OFF, .msg = "hgc_cqe_acc1b_intr found: Ram address is 0x%08X\n", .reg = HGC_CQE_ECC_ADDR, + .type = HISI_SAS_ECC_ERR_HGC_CQE, }, { .irq_msk = BIT(SAS_ECC_INTR_NCQ_MEM0_ECC_1B_OFF), @@ -450,6 +474,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_RXM_DFX_STATUS14_MEM0_OFF, .msg = "rxm_mem0_acc1b_intr found: memory address is 0x%08X\n", .reg = HGC_RXM_DFX_STATUS14, + .type = HISI_SAS_ECC_ERR_HGC_RXM_MEM0, }, { .irq_msk = BIT(SAS_ECC_INTR_NCQ_MEM1_ECC_1B_OFF), @@ -457,6 +482,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_RXM_DFX_STATUS14_MEM1_OFF, .msg = "rxm_mem1_acc1b_intr found: memory address is 0x%08X\n", .reg = HGC_RXM_DFX_STATUS14, + .type = HISI_SAS_ECC_ERR_HGC_RXM_MEM1, }, { .irq_msk = BIT(SAS_ECC_INTR_NCQ_MEM2_ECC_1B_OFF), @@ -464,6 +490,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_RXM_DFX_STATUS14_MEM2_OFF, .msg = "rxm_mem2_acc1b_intr found: memory address is 0x%08X\n", .reg = HGC_RXM_DFX_STATUS14, + .type = HISI_SAS_ECC_ERR_HGC_RXM_MEM2, }, { .irq_msk = BIT(SAS_ECC_INTR_NCQ_MEM3_ECC_1B_OFF), @@ -471,6 +498,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_RXM_DFX_STATUS15_MEM3_OFF, .msg = "rxm_mem3_acc1b_intr found: memory address is 0x%08X\n", .reg = 
HGC_RXM_DFX_STATUS15, + .type = HISI_SAS_ECC_ERR_HGC_RXM_MEM3, }, }; @@ -481,6 +509,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_DQE_ECC_MB_ADDR_OFF, .msg = "hgc_dqe_accbad_intr (0x%x) found: Ram address is 0x%08X\n", .reg = HGC_DQE_ECC_ADDR, + .type = HISI_SAS_ECC_ERR_HGC_DQE, }, { .irq_msk = BIT(SAS_ECC_INTR_IOST_ECC_MB_OFF), @@ -488,6 +517,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_IOST_ECC_MB_ADDR_OFF, .msg = "hgc_iost_accbad_intr (0x%x) found: Ram address is 0x%08X\n", .reg = HGC_IOST_ECC_ADDR, + .type = HISI_SAS_ECC_ERR_HGC_IOST, }, { .irq_msk = BIT(SAS_ECC_INTR_ITCT_ECC_MB_OFF), @@ -495,6 +525,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_ITCT_ECC_MB_ADDR_OFF, .msg = "hgc_itct_accbad_intr (0x%x) found: Ram address is 0x%08X\n", .reg = HGC_ITCT_ECC_ADDR, + .type = HISI_SAS_ECC_ERR_HGC_ITCT, }, { .irq_msk = BIT(SAS_ECC_INTR_IOSTLIST_ECC_MB_OFF), @@ -502,6 +533,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_LM_DFX_STATUS2_IOSTLIST_OFF, .msg = "hgc_iostl_accbad_intr (0x%x) found: memory address is 0x%08X\n", .reg = HGC_LM_DFX_STATUS2, + .type = HISI_SAS_ECC_ERR_HGC_IOSTLIST, }, { .irq_msk = BIT(SAS_ECC_INTR_ITCTLIST_ECC_MB_OFF), @@ -509,6 +541,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_LM_DFX_STATUS2_ITCTLIST_OFF, .msg = "hgc_itctl_accbad_intr (0x%x) found: memory address is 0x%08X\n", .reg = HGC_LM_DFX_STATUS2, + .type = HISI_SAS_ECC_ERR_HGC_ITCTLIST, }, { .irq_msk = BIT(SAS_ECC_INTR_CQE_ECC_MB_OFF), @@ -516,6 +549,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_CQE_ECC_MB_ADDR_OFF, .msg = "hgc_cqe_accbad_intr (0x%x) found: Ram address is 0x%08X\n", .reg = HGC_CQE_ECC_ADDR, + .type = HISI_SAS_ECC_ERR_HGC_CQE, }, { .irq_msk = BIT(SAS_ECC_INTR_NCQ_MEM0_ECC_MB_OFF), @@ -523,6 +557,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_RXM_DFX_STATUS14_MEM0_OFF, .msg = "rxm_mem0_accbad_intr (0x%x) found: memory address is 0x%08X\n", .reg = HGC_RXM_DFX_STATUS14, + .type = HISI_SAS_ECC_ERR_HGC_RXM_MEM0, }, { .irq_msk = 
BIT(SAS_ECC_INTR_NCQ_MEM1_ECC_MB_OFF), @@ -530,6 +565,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_RXM_DFX_STATUS14_MEM1_OFF, .msg = "rxm_mem1_accbad_intr (0x%x) found: memory address is 0x%08X\n", .reg = HGC_RXM_DFX_STATUS14, + .type = HISI_SAS_ECC_ERR_HGC_RXM_MEM1, }, { .irq_msk = BIT(SAS_ECC_INTR_NCQ_MEM2_ECC_MB_OFF), @@ -537,6 +573,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_RXM_DFX_STATUS14_MEM2_OFF, .msg = "rxm_mem2_accbad_intr (0x%x) found: memory address is 0x%08X\n", .reg = HGC_RXM_DFX_STATUS14, + .type = HISI_SAS_ECC_ERR_HGC_RXM_MEM2, }, { .irq_msk = BIT(SAS_ECC_INTR_NCQ_MEM3_ECC_MB_OFF), @@ -544,6 +581,7 @@ struct hisi_sas_err_record_v2 { .shift = HGC_RXM_DFX_STATUS15_MEM3_OFF, .msg = "rxm_mem3_accbad_intr (0x%x) found: memory address is 0x%08X\n", .reg = HGC_RXM_DFX_STATUS15, + .type = HISI_SAS_ECC_ERR_HGC_RXM_MEM3, }, }; @@ -702,6 +740,15 @@ enum { #define DIR_TO_DEVICE 2 #define DIR_RESERVED 3 +/* Vendor specific CPER SEC TYPE for HISI SAS Memory errors */ +#define CPER_SEC_TYPE_HISI_SAS \ + UUID_LE(0xDAFFD814, 0x6EBA, 0x4D8C, 0x8A, 0x91, 0xBC, 0x9B, \ + 0xBF, 0x4A, 0xA3, 0x01) + +#define HISI_SAS_VALID_PA BIT(0) +#define HISI_SAS_VALID_MB_ERR BIT(1) +#define HISI_SAS_VALID_ERR_TYPE BIT(2) + #define ERR_ON_TX_PHASE(err_phase) (err_phase == 0x2 || \ err_phase == 0x4 || err_phase == 0x8 ||\ err_phase == 0x6 || err_phase == 0xa) @@ -2882,6 +2929,17 @@ static irqreturn_t int_chnl_int_v2_hw(int irq_no, void *p) const struct hisi_sas_hw_error *ecc_error; u32 val; int i; + struct hisi_sas_hw_err_info err_data; + bool trace_ns_event_enabled = trace_non_standard_event_enabled(); + + if (trace_ns_event_enabled) { + memset(&err_data, 0, sizeof(err_data)); + err_data.validation_bits = + HISI_SAS_VALID_PA | + HISI_SAS_VALID_MB_ERR | + HISI_SAS_VALID_ERR_TYPE; + err_data.mb_err = HISI_SAS_ERR_SINGLE_BIT_ECC; + } for (i = 0; i < ARRAY_SIZE(one_bit_ecc_errors); i++) { ecc_error = &one_bit_ecc_errors[i]; @@ -2889,7 +2947,18 @@ static irqreturn_t 
int_chnl_int_v2_hw(int irq_no, void *p) val = hisi_sas_read32(hisi_hba, ecc_error->reg); val &= ecc_error->msk; val >>= ecc_error->shift; - dev_warn(dev, ecc_error->msg, val); + if (trace_ns_event_enabled) { + err_data.physical_addr = val; + err_data.type = ecc_error->type; + log_non_standard_event(&CPER_SEC_TYPE_HISI_SAS, + &NULL_UUID_LE, + dev_name(dev), + GHES_SEV_RECOVERABLE, + (const u8 *)&err_data, + sizeof(err_data)); + } else { + dev_warn(dev, ecc_error->msg, val); + } } } } @@ -2901,6 +2970,17 @@ static void multi_bit_ecc_error_process_v2_hw(struct hisi_hba *hisi_hba, const struct hisi_sas_hw_error *ecc_error; u32 val; int i; + struct hisi_sas_hw_err_info err_data; + bool trace_ns_event_enabled = trace_non_standard_event_enabled(); + + if (trace_ns_event_enabled) { + memset(&err_data, 0, sizeof(err_data)); + err_data.validation_bits = + HISI_SAS_VALID_PA | + HISI_SAS_VALID_MB_ERR | + HISI_SAS_VALID_ERR_TYPE; + err_data.mb_err = HISI_SAS_ERR_MULTI_BIT_ECC; + } for (i = 0; i < ARRAY_SIZE(multi_bit_ecc_errors); i++) { ecc_error = &multi_bit_ecc_errors[i]; @@ -2908,7 +2988,18 @@ static void multi_bit_ecc_error_process_v2_hw(struct hisi_hba *hisi_hba, val = hisi_sas_read32(hisi_hba, ecc_error->reg); val &= ecc_error->msk; val >>= ecc_error->shift; - dev_warn(dev, ecc_error->msg, irq_value, val); + if (trace_ns_event_enabled) { + err_data.physical_addr = val; + err_data.type = ecc_error->type; + log_non_standard_event(&CPER_SEC_TYPE_HISI_SAS, + &NULL_UUID_LE, + dev_name(dev), + GHES_SEV_PANIC, + (const u8 *)&err_data, + sizeof(err_data)); + } else { + dev_warn(dev, ecc_error->msg, irq_value, val); + } queue_work(hisi_hba->wq, &hisi_hba->rst_work); } } -- 1.9.1