Hello, some comments below. > Subject: [PATCH v3 1/5] ACPI/RAS/AEST: Initial AEST driver > > Add support for parsing the ARM Error Source Table and basic handling of > errors reported through both memory mapped and system register interfaces. > > Assume system register interfaces are only registered with private > peripheral interrupts (PPIs); otherwise there is no guarantee the > core handling the error is the core which took the error and has the > syndrome info in its system registers. > > In kernel-first mode, all configuration is controlled by the kernel, including > CE ce_threshold and interrupt enable/disable. > > All detected errors will be processed as follows: > - CE, DE: use a workqueue to log these hardware errors. > - UER, UEO: log it and call memory_failure in a workqueue. > - UC, UEU: panic in irq context. > > Signed-off-by: Tyler Baicar <baicar@xxxxxxxxxxxxxxxxxxxxxx> > Signed-off-by: Ruidong Tian <tianruidong@xxxxxxxxxxxxxxxxx> > --- > MAINTAINERS | 10 + > arch/arm64/include/asm/ras.h | 95 ++++ > drivers/acpi/arm64/Kconfig | 11 + > drivers/acpi/arm64/Makefile | 1 + > drivers/acpi/arm64/aest.c | 335 ++++++++++++ > drivers/acpi/arm64/init.c | 2 + > drivers/acpi/arm64/init.h | 1 + > drivers/ras/Kconfig | 1 + > drivers/ras/Makefile | 1 + > drivers/ras/aest/Kconfig | 17 + > drivers/ras/aest/Makefile | 5 + > drivers/ras/aest/aest-core.c | 976 +++++++++++++++++++++++++++++++++++ > drivers/ras/aest/aest.h | 323 ++++++++++++ > include/linux/acpi_aest.h | 68 +++ > include/linux/cpuhotplug.h | 1 + > include/linux/ras.h | 8 + > 16 files changed, 1855 insertions(+) > create mode 100644 arch/arm64/include/asm/ras.h > create mode 100644 drivers/acpi/arm64/aest.c > create mode 100644 drivers/ras/aest/Kconfig > create mode 100644 drivers/ras/aest/Makefile > create mode 100644 drivers/ras/aest/aest-core.c > create mode 100644 drivers/ras/aest/aest.h > create mode 100644 include/linux/acpi_aest.h > > diff --git a/MAINTAINERS b/MAINTAINERS > index 637ddd44245f..d757f9339627 100644 > --- 
a/MAINTAINERS > +++ b/MAINTAINERS > @@ -330,6 +330,16 @@ S: Maintained > F: drivers/acpi/arm64 > F: include/linux/acpi_iort.h > > +ACPI AEST > +M: Ruidong Tian <tianruidond@xxxxxxxxxxxxxxxxx> > +L: linux-acpi@xxxxxxxxxxxxxxx > +L: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx > +S: Supported > +F: arch/arm64/include/asm/ras.h > +F: drivers/acpi/arm64/aest.c > +F: drivers/ras/aest/ > +F: include/linux/acpi_aest.h > + > ACPI FOR RISC-V (ACPI/riscv) > M: Sunil V L <sunilvl@xxxxxxxxxxxxxxxx> > L: linux-acpi@xxxxxxxxxxxxxxx > diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h > new file mode 100644 > index 000000000000..7676add8a0ed > --- /dev/null > +++ b/arch/arm64/include/asm/ras.h > @@ -0,0 +1,95 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef __ASM_RAS_H > +#define __ASM_RAS_H > + > +#include <linux/types.h> > +#include <linux/bits.h> > + > +/* ERR<n>FR */ > +#define ERR_FR_CE GENMASK_ULL(54, 53) > +#define ERR_FR_RP BIT(15) > +#define ERR_FR_CEC GENMASK_ULL(14, 12) > + > +#define ERR_FR_RP_SINGLE_COUNTER 0 > +#define ERR_FR_RP_DOUBLE_COUNTER 1 > + > +#define ERR_FR_CEC_0B_COUNTER 0 > +#define ERR_FR_CEC_8B_COUNTER BIT(1) > +#define ERR_FR_CEC_16B_COUNTER BIT(2) > + > +/* ERR<n>STATUS */ > +#define ERR_STATUS_AV BIT(31) > +#define ERR_STATUS_V BIT(30) > +#define ERR_STATUS_UE BIT(29) > +#define ERR_STATUS_ER BIT(28) > +#define ERR_STATUS_OF BIT(27) > +#define ERR_STATUS_MV BIT(26) > +#define ERR_STATUS_CE (BIT(25) | BIT(24)) > +#define ERR_STATUS_DE BIT(23) > +#define ERR_STATUS_PN BIT(22) > +#define ERR_STATUS_UET (BIT(21) | BIT(20)) > +#define ERR_STATUS_CI BIT(19) > +#define ERR_STATUS_IERR GENMASK_ULL(15, 8) > +#define ERR_STATUS_SERR GENMASK_ULL(7, 0) > + > +/* Theses bits are write-one-to-clear */ > +#define ERR_STATUS_W1TC (ERR_STATUS_AV | ERR_STATUS_V | ERR_STATUS_UE | \ > + ERR_STATUS_ER | ERR_STATUS_OF | ERR_STATUS_MV | \ > + ERR_STATUS_CE | ERR_STATUS_DE | ERR_STATUS_PN | \ > + ERR_STATUS_UET | ERR_STATUS_CI) > + > +#define 
ERR_STATUS_UET_UC 0 > +#define ERR_STATUS_UET_UEU 1 > +#define ERR_STATUS_UET_UEO 2 > +#define ERR_STATUS_UET_UER 3 > + > +/* ERR<n>CTLR */ > +#define ERR_CTLR_CFI BIT(8) > +#define ERR_CTLR_FI BIT(3) > +#define ERR_CTLR_UI BIT(2) > + > +/* ERR<n>ADDR */ > +#define ERR_ADDR_AI BIT(61) > +#define ERR_ADDR_PADDR GENMASK_ULL(55, 0) > + > +/* ERR<n>MISC0 */ > + > +/* ERR<n>FR.CEC == 0b010, ERR<n>FR.RP == 0 */ > +#define ERR_MISC0_8B_OF BIT(39) > +#define ERR_MISC0_8B_CEC GENMASK_ULL(38, 32) > + > +/* ERR<n>FR.CEC == 0b100, ERR<n>FR.RP == 0 */ > +#define ERR_MISC0_16B_OF BIT(47) > +#define ERR_MISC0_16B_CEC GENMASK_ULL(46, 32) > + > +#define ERR_MISC0_CEC_SHIFT 31 > + > +#define ERR_8B_CEC_MAX (ERR_MISC0_8B_CEC >> ERR_MISC0_CEC_SHIFT) > +#define ERR_16B_CEC_MAX (ERR_MISC0_16B_CEC >> ERR_MISC0_CEC_SHIFT) > + > +/* ERR<n>FR.CEC == 0b100, ERR<n>FR.RP == 1 */ > +#define ERR_MISC0_16B_OFO BIT(63) > +#define ERR_MISC0_16B_CECO GENMASK_ULL(62, 48) > +#define ERR_MISC0_16B_OFR BIT(47) > +#define ERR_MISC0_16B_CECR GENMASK_ULL(46, 32) > + > +/* ERRDEVARCH */ > +#define ERRDEVARCH_REV GENMASK(19, 16) > + > +enum ras_ce_threshold { > + RAS_CE_THRESHOLD_0B, > + RAS_CE_THRESHOLD_8B, > + RAS_CE_THRESHOLD_16B, > + RAS_CE_THRESHOLD_32B, > + UNKNOWN, > +}; > + > +struct ras_ext_regs { > + u64 err_fr; > + u64 err_ctlr; > + u64 err_status; > + u64 err_addr; > + u64 err_misc[4]; > +}; > + > +#endif /* __ASM_RAS_H */ > diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig > index b3ed6212244c..c8eb6de95733 100644 > --- a/drivers/acpi/arm64/Kconfig > +++ b/drivers/acpi/arm64/Kconfig > @@ -21,3 +21,14 @@ config ACPI_AGDI > > config ACPI_APMT > bool > + > +config ACPI_AEST > + bool "ARM Error Source Table Support" > + depends on ARM64_RAS_EXTN > + > + help > + The Arm Error Source Table (AEST) provides details on ACPI > + extensions that enable kernel-first handling of errors in a > + system that supports the Armv8 RAS extensions. 
> + > + If set, the kernel will report and log hardware errors. > diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile > index 05ecde9eaabe..8e240b281fd1 100644 > --- a/drivers/acpi/arm64/Makefile > +++ b/drivers/acpi/arm64/Makefile > @@ -6,5 +6,6 @@ obj-$(CONFIG_ACPI_GTDT) += gtdt.o > obj-$(CONFIG_ACPI_IORT) += iort.o > obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o > obj-$(CONFIG_ARM_AMBA) += amba.o > +obj-$(CONFIG_ACPI_AEST) += aest.o > obj-y += dma.o init.o > obj-y += thermal_cpufreq.o > diff --git a/drivers/acpi/arm64/aest.c b/drivers/acpi/arm64/aest.c > new file mode 100644 > index 000000000000..6dba9c23e04e > --- /dev/null > +++ b/drivers/acpi/arm64/aest.c > @@ -0,0 +1,335 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * ARM Error Source Table Support > + * > + * Copyright (c) 2024, Alibaba Group. > + */ > + > +#include <linux/xarray.h> > +#include <linux/platform_device.h> > +#include <linux/acpi_aest.h> > + > +#include "init.h" > + > +#undef pr_fmt > +#define pr_fmt(fmt) "ACPI AEST: " fmt > + > +static struct xarray *aest_array; > + > +static void __init aest_init_interface(struct acpi_aest_hdr *hdr, > + struct acpi_aest_node *node) > +{ > + struct acpi_aest_node_interface_header *interface; > + > + interface = ACPI_ADD_PTR(struct acpi_aest_node_interface_header, hdr, > + hdr->node_interface_offset); > + > + node->type = hdr->type; > + node->interface_hdr = interface; > + > + switch (interface->group_format) { > + case ACPI_AEST_NODE_GROUP_FORMAT_4K: { > + struct acpi_aest_node_interface_4k *interface_4k = > + (struct acpi_aest_node_interface_4k *)(interface + 1); > + > + node->common = &interface_4k->common; > + node->record_implemented = > + (unsigned long *)&interface_4k->error_record_implemented; > + node->status_reporting = > + (unsigned long *)&interface_4k->error_status_reporting; > + node->addressing_mode = > + (unsigned long *)&interface_4k->addressing_mode; > + break; > + } > + case ACPI_AEST_NODE_GROUP_FORMAT_16K: { > + 
struct acpi_aest_node_interface_16k *interface_16k = > + (struct acpi_aest_node_interface_16k *)(interface + 1); > + > + node->common = &interface_16k->common; > + node->record_implemented = > + (unsigned long *)interface_16k->error_record_implemented; > + node->status_reporting = > + (unsigned long *)interface_16k->error_status_reporting; > + node->addressing_mode = > + (unsigned long *)interface_16k->addressing_mode; > + break; > + } > + case ACPI_AEST_NODE_GROUP_FORMAT_64K: { > + struct acpi_aest_node_interface_64k *interface_64k = > + (struct acpi_aest_node_interface_64k *)(interface + 1); > + > + node->common = &interface_64k->common; > + node->record_implemented = > + (unsigned long *)interface_64k->error_record_implemented; > + node->status_reporting = > + (unsigned long *)interface_64k->error_status_reporting; > + node->addressing_mode = > + (unsigned long *)interface_64k->addressing_mode; > + break; > + } > + default: > + pr_err("invalid group format: %d\n", interface->group_format); > + } > + > + node->interrupt = ACPI_ADD_PTR(struct acpi_aest_node_interrupt_v2, > + hdr, hdr->node_interrupt_offset); > + > + node->interrupt_count = hdr->node_interrupt_count; > +} > + > +static int __init acpi_aest_init_node_common(struct acpi_aest_hdr *aest_hdr, > + struct acpi_aest_node *node) > +{ > + int ret; > + struct aest_hnode *hnode; > + u64 error_device_id; > + > + aest_init_interface(aest_hdr, node); > + > + error_device_id = node->common->error_node_device; I think I see a problem with this.