On Mon, 04 Mar 2024 11:15:16 +0000, Ruidong Tian <tianruidong@xxxxxxxxxxxxxxxxx> wrote: > > From: Tyler Baicar <baicar@xxxxxxxxxxxxxxxxxxxxxx> > > Add support for parsing the ARM Error Source Table and basic handling of > errors reported through both memory mapped and system register interfaces. > > Assume system register interfaces are only registered with private > peripheral interrupts (PPIs); otherwise there is no guarantee the > core handling the error is the core which took the error and has the > syndrome info in its system registers. > > All detected errors will be collected to a workqueue in irq context and > print error information later. > > Signed-off-by: Tyler Baicar <baicar@xxxxxxxxxxxxxxxxxxxxxx> > Signed-off-by: Ruidong Tian <tianruidong@xxxxxxxxxxxxxxxxx> > --- > MAINTAINERS | 11 + > arch/arm64/include/asm/ras.h | 38 ++ > drivers/acpi/arm64/Kconfig | 10 + > drivers/acpi/arm64/Makefile | 1 + > drivers/acpi/arm64/aest.c | 723 +++++++++++++++++++++++++++++++++++ > include/linux/acpi_aest.h | 91 +++++ > include/linux/cpuhotplug.h | 1 + > 7 files changed, 875 insertions(+) > create mode 100644 arch/arm64/include/asm/ras.h > create mode 100644 drivers/acpi/arm64/aest.c > create mode 100644 include/linux/acpi_aest.h > > diff --git a/MAINTAINERS b/MAINTAINERS > index 2a7a90eeec49..5df845763a9c 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -329,6 +329,17 @@ L: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx (moderated for non-subscribers) > S: Maintained > F: drivers/acpi/arm64 > > +ACPI AEST > +M: Tyler Baicar <baicar@xxxxxxxxxxxxxxxxxxxxxx> > +M: Ruidong Tian <tianruidond@xxxxxxxxxxxxxxxxx> > +L: linux-acpi@xxxxxxxxxxxxxxx > +L: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx > +S: Supported > +F: arch/arm64/include/asm/ras.h > +F: drivers/acpi/arm64/aest.c > +F: include/linux/acpi_aest.h > + > + > ACPI FOR RISC-V (ACPI/riscv) > M: Sunil V L <sunilvl@xxxxxxxxxxxxxxxx> > L: linux-acpi@xxxxxxxxxxxxxxx > diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h > new file mode 100644 > index 000000000000..2fb0d9741567 > --- /dev/null > +++ b/arch/arm64/include/asm/ras.h > @@ -0,0 +1,38 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef __ASM_RAS_H > +#define __ASM_RAS_H > + > +#include <linux/types.h> > +#include <linux/bits.h> > + > +#define ERR_STATUS_AV BIT(31) > +#define ERR_STATUS_V BIT(30) > +#define ERR_STATUS_UE BIT(29) > +#define ERR_STATUS_ER BIT(28) > +#define ERR_STATUS_OF BIT(27) > +#define ERR_STATUS_MV BIT(26) > +#define ERR_STATUS_CE (BIT(25) | BIT(24)) > +#define ERR_STATUS_DE BIT(23) > +#define ERR_STATUS_PN BIT(22) > +#define ERR_STATUS_UET (BIT(21) | BIT(20)) > +#define ERR_STATUS_CI BIT(19) > +#define ERR_STATUS_IERR GENMASK_ULL(15, 8) > +#define ERR_STATUS_SERR GENMASK_ULL(7, 0) All these bits need to be defined in arch/arm64/tools/sysreg as ERXSTATUS_EL1 fields. > + > +/* These bit is write-one-to-clear */ > +#define ERR_STATUS_W1TC (ERR_STATUS_AV | ERR_STATUS_V | ERR_STATUS_UE | \ > + ERR_STATUS_ER | ERR_STATUS_OF | ERR_STATUS_MV | \ > + ERR_STATUS_CE | ERR_STATUS_DE | ERR_STATUS_PN | \ > + ERR_STATUS_UET | ERR_STATUS_CI) > + > +#define RAS_REV_v1_1 0x1 What is this? We already have ID_AA64PFR1_EL1.RAS_frac. > + > +struct ras_ext_regs { > + u64 err_fr; > + u64 err_ctlr; > + u64 err_status; > + u64 err_addr; > + u64 err_misc[4]; > +}; > + > +#endif /* __ASM_RAS_H */ > diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig > index b3ed6212244c..639db671c5cf 100644 > --- a/drivers/acpi/arm64/Kconfig > +++ b/drivers/acpi/arm64/Kconfig > @@ -21,3 +21,13 @@ config ACPI_AGDI > > config ACPI_APMT > bool > + > +config ACPI_AEST > + bool "ARM Error Source Table Support" > + > + help > + The Arm Error Source Table (AEST) provides details on ACPI > + extensions that enable kernel-first handling of errors in a > + system that supports the Armv8 RAS extensions. > + > + If set, the kernel will report and log hardware errors. > diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile > index 726944648c9b..5ea82b196b90 100644 > --- a/drivers/acpi/arm64/Makefile > +++ b/drivers/acpi/arm64/Makefile > @@ -6,3 +6,4 @@ obj-$(CONFIG_ACPI_APMT) += apmt.o > obj-$(CONFIG_ARM_AMBA) += amba.o > obj-y += dma.o init.o > obj-y += thermal_cpufreq.o > +obj-$(CONFIG_ACPI_AEST) += aest.o > diff --git a/drivers/acpi/arm64/aest.c b/drivers/acpi/arm64/aest.c > new file mode 100644 > index 000000000000..be0883316449 > --- /dev/null > +++ b/drivers/acpi/arm64/aest.c > @@ -0,0 +1,723 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * ARM Error Source Table Support > + * > + * Copyright (c) 2021, Ampere Computing LLC > + * Copyright (c) 2021-2024, Alibaba Group. > + */ > + > +#include <linux/acpi.h> > +#include <linux/acpi_aest.h> > +#include <linux/cpuhotplug.h> > +#include <linux/kernel.h> > +#include <linux/genalloc.h> > +#include <linux/llist.h> > +#include <acpi/actbl.h> > +#include <asm/ras.h> > + > +#undef pr_fmt > +#define pr_fmt(fmt) "ACPI AEST: " fmt > + > +#define CASE_READ_CLEAR(x, clear) \ > + case (x): { \ > + res = read_sysreg_s(SYS_##x##_EL1); \ > + if (clear) \ > + write_sysreg_s(0, SYS_##x##_EL1); \ > + break; \ > + } Please don't use macros with side effects. This is horrible to debug. Instead, *return* the value from the macro, or pass the variable you want to affect as a parameter. Also, what ensures the synchronisation of this write? How is the W1TC aspect enforced? > + > +#define CASE_WRITE(val, x) \ > + case (x): { \ > + write_sysreg_s((val), SYS_##x##_EL1); \ > + break; \ > + } > + > +static struct acpi_table_header *aest_table; > + > +static struct aest_node __percpu **ppi_data; > + > +static int *ppi_irqs; > +static u8 num_ppi; > +static u8 ppi_idx; > + > +static struct work_struct aest_work; > + > +static struct gen_pool *aest_node_pool; > +static struct llist_head aest_node_llist; > + > +/* > + * This memory pool is only to be used to save AEST node in AEST irq context. > + * Use 8 pages to save AEST node for now (~500 AEST node at most). > + */ > +#define AEST_NODE_POOLSZ (8 * PAGE_SIZE) This doesn't make sense. PAGE_SIZE is a variable concept (ranging from 4 to 64kB). What is this "~500" number coming from? If you want to store a given number of records, allocate the size you actually want. > + > +static u64 aest_sysreg_read_clear(u64 base, u32 offset, bool clear) > +{ > + u64 res; > + > + switch (offset) { > + CASE_READ_CLEAR(ERXFR, clear) > + CASE_READ_CLEAR(ERXCTLR, clear) > + CASE_READ_CLEAR(ERXSTATUS, clear) > + CASE_READ_CLEAR(ERXADDR, clear) > + CASE_READ_CLEAR(ERXMISC0, clear) > + CASE_READ_CLEAR(ERXMISC1, clear) > + CASE_READ_CLEAR(ERXMISC2, clear) > + CASE_READ_CLEAR(ERXMISC3, clear) > + default : > + res = 0; > + } > + return res; > +} > + > +static void aest_sysreg_write(u64 base, u32 offset, u64 val) > +{ > + switch (offset) { > + CASE_WRITE(val, ERXFR) > + CASE_WRITE(val, ERXCTLR) > + CASE_WRITE(val, ERXSTATUS) > + CASE_WRITE(val, ERXADDR) > + CASE_WRITE(val, ERXMISC0) > + CASE_WRITE(val, ERXMISC1) > + CASE_WRITE(val, ERXMISC2) > + CASE_WRITE(val, ERXMISC3) > + default : > + return; > + } > +} > + > +static u64 aest_iomem_read_clear(u64 base, u32 offset, bool clear) > +{ > + u64 res; > + > + res = readq((void *)(base + offset)); > + if (clear) > + writeq(0, (void *)(base + offset)); Do you need the explicit synchronisation? What ordering are you trying to guarantee? > + return res; > +} > + > +static void aest_iomem_write(u64 base, u32 offset, u64 val) > +{ > + writeq(val, (void *)(base + offset)); Same question. > +} > + > +static void aest_print(struct aest_node_llist *lnode) > +{ > + static atomic_t seqno; Uninitialised atomic? > + unsigned int curr_seqno; > + char pfx_seq[64]; Magic number? > + int index; > + struct ras_ext_regs *regs; > + > + curr_seqno = atomic_inc_return(&seqno); > + snprintf(pfx_seq, sizeof(pfx_seq), "{%u}" HW_ERR, curr_seqno); > + pr_info("%sHardware error from %s\n", pfx_seq, lnode->node_name); > + > + switch (lnode->type) { > + case ACPI_AEST_PROCESSOR_ERROR_NODE: > + pr_err("%s Error from CPU%d\n", pfx_seq, lnode->id0); > + break; > + case ACPI_AEST_MEMORY_ERROR_NODE: > + pr_err("%s Error from memory at SRAT proximity domain 0x%x\n", > + pfx_seq, lnode->id0); > + break; > + case ACPI_AEST_SMMU_ERROR_NODE: > + pr_err("%s Error from SMMU IORT node 0x%x subcomponent 0x%x\n", > + pfx_seq, lnode->id0, lnode->id1); > + break; > + case ACPI_AEST_VENDOR_ERROR_NODE: > + pr_err("%s Error from vendor hid 0x%x uid 0x%x\n", > + pfx_seq, lnode->id0, lnode->id1); > + break; > + case ACPI_AEST_GIC_ERROR_NODE: > + pr_err("%s Error from GIC type 0x%x instance 0x%x\n", > + pfx_seq, lnode->id0, lnode->id1); > + break; > + default: > + pr_err("%s Unknown AEST node type\n", pfx_seq); > + return; > + } > + > + index = lnode->index; > + regs = &lnode->regs; > + > + pr_err("%s ERR%uFR: 0x%llx\n", pfx_seq, index, regs->err_fr); > + pr_err("%s ERR%uCTRL: 0x%llx\n", pfx_seq, index, regs->err_ctlr); > + pr_err("%s ERR%uSTATUS: 0x%llx\n", pfx_seq, index, regs->err_status); > + if (regs->err_status & ERR_STATUS_AV) > + pr_err("%s ERR%uADDR: 0x%llx\n", pfx_seq, index, regs->err_addr); > + > + if (regs->err_status & ERR_STATUS_MV) { > + pr_err("%s ERR%uMISC0: 0x%llx\n", pfx_seq, index, regs->err_misc[0]); > + pr_err("%s ERR%uMISC1: 0x%llx\n", pfx_seq, index, regs->err_misc[1]); > + pr_err("%s ERR%uMISC2: 0x%llx\n", pfx_seq, index, regs->err_misc[2]); > + pr_err("%s ERR%uMISC3: 0x%llx\n", pfx_seq, index, regs->err_misc[3]); > + } > +} > + > + > +static void aest_node_pool_process(struct work_struct *__unused) > +{ > + struct llist_node *head; > + struct aest_node_llist *lnode, *tmp; > + > + head = llist_del_all(&aest_node_llist); > + if (!head) > + return; > + > + head = llist_reverse_order(head); > + llist_for_each_entry_safe(lnode, tmp, head, llnode) { > + aest_print(lnode); > + gen_pool_free(aest_node_pool, (unsigned long)lnode, > + sizeof(*lnode)); > + } > +} > + > +static int aest_node_gen_pool_add(struct aest_node *node, int index, > + struct ras_ext_regs *regs) > +{ > + struct aest_node_llist *list; > + > + if (!aest_node_pool) > + return -EINVAL; > + > + list = (void *)gen_pool_alloc(aest_node_pool, sizeof(*list)); > + if (!list) > + return -ENOMEM; > + > + list->type = node->type; > + list->node_name = node->name; > + switch (node->type) { > + case ACPI_AEST_PROCESSOR_ERROR_NODE: > + list->id0 = node->spec.processor.processor_id; > + if (node->spec.processor.flags & (ACPI_AEST_PROC_FLAG_SHARED | > + ACPI_AEST_PROC_FLAG_GLOBAL)) > + list->id0 = smp_processor_id(); > + > + list->id1 = node->spec.processor.resource_type; > + break; > + case ACPI_AEST_MEMORY_ERROR_NODE: > + list->id0 = node->spec.memory.srat_proximity_domain; > + break; > + case ACPI_AEST_SMMU_ERROR_NODE: > + list->id0 = node->spec.smmu.iort_node_reference; > + list->id1 = node->spec.smmu.subcomponent_reference; > + break; > + case ACPI_AEST_VENDOR_ERROR_NODE: > + list->id0 = node->spec.vendor.acpi_hid; > + list->id1 = node->spec.vendor.acpi_uid; > + break; > + case ACPI_AEST_GIC_ERROR_NODE: > + list->id0 = node->spec.gic.interface_type; > + list->id1 = node->spec.gic.instance_id; > + break; > + default: > + list->id0 = 0; > + list->id1 = 0; > + } > + > + memcpy(&list->regs, regs, sizeof(*regs)); You have vmalloced the record. Why do you need to copy it instead of simply pointing to it? > + list->index = index; > + llist_add(&list->llnode, &aest_node_llist); > + > + return 0; > +} > + > +static int aest_node_pool_init(void) > +{ > + unsigned long addr, size; > + int rc; > + > + if (aest_node_pool) > + return 0; > + > + aest_node_pool = gen_pool_create(ilog2(sizeof(struct aest_node_llist)), -1); > + if (!aest_node_pool) > + return -ENOMEM; > + > + size = PAGE_ALIGN(AEST_NODE_POOLSZ); > + addr = (unsigned long)vmalloc(size); > + if (!addr) > + goto err_pool_alloc; > + > + rc = gen_pool_add(aest_node_pool, addr, size, -1); > + if (rc) > + goto err_pool_add; > + > + return 0; > + > +err_pool_add: > + vfree((void *)addr); > + > +err_pool_alloc: > + gen_pool_destroy(aest_node_pool); > + > + return -ENOMEM; > +} > + > +static void aest_log(struct aest_node *node, int index, struct ras_ext_regs *regs) > +{ > + if (!aest_node_gen_pool_add(node, index, regs)) > + schedule_work(&aest_work); > +} > + > +/* > + * you must select cpu number first in order to operate RAS register belonged > + * that cpu. > + */ > +static void aest_select_cpu(struct aest_node *node, int i) > +{ > + if (node->interface.type == ACPI_AEST_NODE_SYSTEM_REGISTER) { > + write_sysreg_s(i, SYS_ERRSELR_EL1); ERRSELR_EL1 doesn't select a CPU. It selects a RAS record. How records and CPUs are mapped isn't specified in the architecture. > + isb(); > + } > +} > + > +static void aest_proc(struct aest_node *node) > +{ > + struct ras_ext_regs regs = {0}; > + struct aest_access *access; > + int i; > + u64 regs_p; > + > + > + for (i = node->interface.record_start; i < node->interface.record_end; i++) { > + /* 1b: Error record at i index is not implemented */ > + if (test_bit(i, &node->interface.record_implemented)) > + continue; > + > + aest_select_cpu(node, i); > + > + access = node->access; > + regs_p = (u64)&node->interface.regs[i]; > + > + regs.err_status = access->read_clear(regs_p, ERXSTATUS, false); > + if (!(regs.err_status & ERR_STATUS_V)) > + continue; > + > + if (regs.err_status & ERR_STATUS_AV) > + regs.err_addr = access->read_clear(regs_p, ERXADDR, false); > + > + regs.err_fr = access->read_clear(regs_p, ERXFR, false); > + regs.err_ctlr = access->read_clear(regs_p, ERXCTLR, false); > + > + if (regs.err_status & ERR_STATUS_MV) { > + bool clear = node->interface.flags & ACPI_AEST_INTERFACE_CLEAR_MISC; > + > + regs.err_misc[0] = access->read_clear(regs_p, ERXMISC0, clear); > + regs.err_misc[1] = access->read_clear(regs_p, ERXMISC1, clear); > + regs.err_misc[2] = access->read_clear(regs_p, ERXMISC2, clear); > + regs.err_misc[3] = access->read_clear(regs_p, ERXMISC3, clear); > + } > + > + aest_log(node, i, ®s); > + > + if (regs.err_status & ERR_STATUS_UE) > + panic("AEST: uncorrectable error encountered"); > + > + /* Write-one-to-clear the bits we've seen */ > + regs.err_status &= ERR_STATUS_W1TC; > + > + /* Multi bit filed need to write all-ones to clear. */ > + if (regs.err_status & ERR_STATUS_CE) > + regs.err_status |= ERR_STATUS_CE; > + > + /* Multi bit filed need to write all-ones to clear. */ > + if (regs.err_status & ERR_STATUS_UET) > + regs.err_status |= ERR_STATUS_UET; > + > + access->write(regs_p, ERXSTATUS, regs.err_status); > + } > +} > + > +static irqreturn_t aest_irq_func(int irq, void *input) > +{ > + struct aest_node *node = input; > + > + aest_proc(node); > + > + return IRQ_HANDLED; > +} > + > +static int __init aest_register_gsi(u32 gsi, int trigger, void *data, > + irq_handler_t aest_irq_func) > +{ > + int cpu, irq; > + > + irq = acpi_register_gsi(NULL, gsi, trigger, ACPI_ACTIVE_HIGH); > + > + if (irq == -EINVAL) { > + pr_err("failed to map AEST GSI %d\n", gsi); > + return -EINVAL; > + } > + > + if (gsi < 16) { > + pr_err("invalid GSI %d\n", gsi); > + return -EINVAL; > + } else if (gsi < 32) { > + if (ppi_idx >= AEST_MAX_PPI) { > + pr_err("Unable to register PPI %d\n", gsi); > + return -EINVAL; > + } > + ppi_irqs[ppi_idx] = irq; > + enable_percpu_irq(irq, IRQ_TYPE_NONE); Enabling the PPI before requesting it? Looks... great. And how does this work on a system that supports EPPIs, which are in the [1119:1056] range? Also, if you get a trigger as a parameter, why the IRQ_TYPE_NONE? > + for_each_possible_cpu(cpu) { > + memcpy(per_cpu_ptr(ppi_data[ppi_idx], cpu), data, > + sizeof(struct aest_node)); > + } > + if (request_percpu_irq(irq, aest_irq_func, "AEST", > + ppi_data[ppi_idx++])) { > + pr_err("failed to register AEST IRQ %d\n", irq); > + return -EINVAL; > + } > + } else if (gsi < 1020) { > + if (request_irq(irq, aest_irq_func, IRQF_SHARED, "AEST", > + data)) { Why SHARED? Who would share a RAS interrupt????? > + pr_err("failed to register AEST IRQ %d\n", irq); > + return -EINVAL; Same question about extended SPIs. All in all, this whole logic is totally useless. It isn't the driver's job to classify the GIC INTIDs... M. -- Without deviation from the norm, progress is not possible.