On Wed, 19 Jun 2024 00:53:10 +0800 Shiyang Ruan <ruansy.fnst@xxxxxxxxxxx> wrote: > Background: > Since CXL device is a memory device, while CPU consumes a poison page of > CXL device, it always triggers an MCE by interrupt (INT18), no matter > which-First path is configured. This is the first report. Then > currently, in FW-First path, the poison event is transferred according > to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES > -> CPER -> trace report. This is the second one. These two reports > are indicating the same poisoned page, which is the so-called "duplicate > report"[1]. And the memory_failure() handling I'm trying to add in > OS-First path could also be another duplicate report. > > Hope the flow below could make it easier to understand: > CPU accesses bad memory on CXL device, then > -> MCE (INT18), *always* report (1) > -> * FW-First (implemented now) > -> CXL device -> FW > -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a) > * OS-First (not implemented yet, I'm working on it) > -> CXL device -> MSI > -> OS:CXL driver -> memory_failure() (2.b) > so, the (1) and (2.a/b) are duplicated. > > (I didn't get response in my reply for [1] while I have to make patch to > solve this problem, so please correct me if my understanding is wrong.) > > This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev > to check whether the current poison page has been reported (if yes, > stop the notifier chain, won't call the following memory_failure() > to report), into `x86_mce_decoder_chain`. In this way, if the poison > page is already handled (recorded and reported) in (1) or (2), the other one > won't duplicate the report. The record can be cleared when > cxl_clear_poison() is called. 
> > [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.notmuch/ > > Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxx> So poison can be cleared in a number of ways and a CXL poison clear command is unfortunately only one of them. Some architectures have instructions that guarantee to write a whole cacheline and can clear things as well. I believe x86 does for starters. +CC linux-edac and related maintainers / reviewers. linux-mm and hwpoison maintainer. So I think this needs a more general solution that encompasses more general cleanup of poison. Trivial comments inline. Jonathan > --- > arch/x86/include/asm/mce.h | 1 + > drivers/cxl/core/mbox.c | 130 +++++++++++++++++++++++++++++++++++++ > drivers/cxl/core/memdev.c | 6 +- > drivers/cxl/cxlmem.h | 3 + > 4 files changed, 139 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h > index dfd2e9699bd7..d8109c48e7d9 100644 > --- a/arch/x86/include/asm/mce.h > +++ b/arch/x86/include/asm/mce.h > @@ -182,6 +182,7 @@ enum mce_notifier_prios { > MCE_PRIO_NFIT, > MCE_PRIO_EXTLOG, > MCE_PRIO_UC, > + MCE_PRIO_CXL, > MCE_PRIO_EARLY, > MCE_PRIO_CEC, > MCE_PRIO_HIGHEST = MCE_PRIO_CEC > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 2626f3fff201..0eb3c5401e81 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -4,6 +4,8 @@ > #include <linux/debugfs.h> > #include <linux/ktime.h> > #include <linux/mutex.h> > +#include <linux/notifier.h> > +#include <asm/mce.h> > #include <asm/unaligned.h> > #include <cxlpci.h> > #include <cxlmem.h> > @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, > if (cxlr) > hpa = cxl_trace_hpa(cxlr, cxlmd, dpa); > > + if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa)) > + return; > + > if (event_type == CXL_CPER_EVENT_GEN_MEDIA) > trace_cxl_general_media(cxlmd, type, cxlr, hpa, > &evt->gen_media); > @@ -1408,6 +1413,127 @@ int 
cxl_poison_state_init(struct cxl_memdev_state *mds) > } > EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL); > > +struct cxl_mce_record { > + struct list_head node; > + u64 hpa; > +}; > +LIST_HEAD(cxl_mce_records); > +DEFINE_MUTEX(cxl_mce_mutex); > + > +bool cxl_mce_recorded(u64 hpa) > +{ > + struct cxl_mce_record *cur, *next, *rec; > + int rc; > + > + rc = mutex_lock_interruptible(&cxl_mce_mutex); guard(mutex)(&cxl_mce_mutex); > + if (rc) > + return false; > + > + list_for_each_entry_safe(cur, next, &cxl_mce_records, node) { > + if (cur->hpa == hpa) { > + mutex_unlock(&cxl_mce_mutex); > + return true; > + } > + } > + > + rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL); > + rec->hpa = hpa; > + list_add(&cxl_mce_records, &rec->node); > + > + mutex_unlock(&cxl_mce_mutex); > + > + return false; > +} > + > +void cxl_mce_clear(u64 hpa) > +{ > + struct cxl_mce_record *cur, *next; > + int rc; > + > + rc = mutex_lock_interruptible(&cxl_mce_mutex); Maybe cond_guard(). > + if (rc) > + return; > + > + list_for_each_entry_safe(cur, next, &cxl_mce_records, node) { > + if (cur->hpa == hpa) { > + list_del(&cur->node); > + break; > + } > + } > + > + mutex_unlock(&cxl_mce_mutex); > +} > + > +struct cxl_contains_hpa_context { > + bool contains; > + u64 hpa; > +}; > + > +static int __cxl_contains_hpa(struct device *dev, void *arg) > +{ > + struct cxl_contains_hpa_context *ctx = arg; > + struct cxl_endpoint_decoder *cxled; > + struct range *range; > + u64 hpa = ctx->hpa; > + > + if (!is_endpoint_decoder(dev)) > + return 0; > + > + cxled = to_cxl_endpoint_decoder(dev); > + range = &cxled->cxld.hpa_range; > + > + if (range->start <= hpa && hpa <= range->end) { > + ctx->contains = true; > + return 1; > + } > + > + return 0; > +} > + > +static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa) > +{ > + struct cxl_contains_hpa_context ctx = { > + .contains = false, > + .hpa = hpa, > + }; > + struct cxl_port *port; > + > + port = cxlmd->endpoint; > + if (port && 
is_cxl_endpoint(port) && cxl_num_decoders_committed(port)) > + device_for_each_child(&port->dev, &ctx, __cxl_contains_hpa); > + > + return ctx.contains; > +} > + > +static int cxl_handle_mce(struct notifier_block *nb, unsigned long val, > + void *data) > +{ > + struct mce *mce = (struct mce *)data; > + struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state, > + mce_notifier); > + u64 hpa; > + > + if (!mce || !mce_usable_address(mce)) > + return NOTIFY_DONE; > + > + hpa = mce->addr & MCI_ADDR_PHYSADDR; > + > + /* Check if the PFN is located on this CXL device */ > + if (!pfn_valid(hpa >> PAGE_SHIFT) && > + !cxl_contains_hpa(mds->cxlds.cxlmd, hpa)) > + return NOTIFY_DONE; > + > + /* > + * Search PFN in the cxl_mce_records, if already exists, don't continue > + * to do memory_failure() to avoid a poison address being reported > + * more than once. > + */ > + if (cxl_mce_recorded(hpa)) > + return NOTIFY_STOP; > + else > + return NOTIFY_OK; > +} > + > struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) > { > struct cxl_memdev_state *mds; > @@ -1427,6 +1553,10 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) > mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID; > mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID; > > + mds->mce_notifier.notifier_call = cxl_handle_mce; > + mds->mce_notifier.priority = MCE_PRIO_CXL; > + mce_register_decode_chain(&mds->mce_notifier); > + > return mds; > } > EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, CXL); > diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c > index 0277726afd04..aa3ac89d17be 100644 > --- a/drivers/cxl/core/memdev.c > +++ b/drivers/cxl/core/memdev.c > @@ -376,10 +376,14 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa) > goto out; > > cxlr = cxl_dpa_to_region(cxlmd, dpa); > - if (cxlr) > + if (cxlr) { > + u64 hpa = cxl_trace_hpa(cxlr, cxlmd, dpa); > + > + cxl_mce_clear(hpa); > dev_warn_once(mds->cxlds.dev, > "poison clear dpa:%#llx region: 
%s\n", dpa, > dev_name(&cxlr->dev)); > + } > > record = (struct cxl_poison_record) { > .address = cpu_to_le64(dpa), > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 19aba81cdf13..fbf8d9f46984 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -501,6 +501,7 @@ struct cxl_memdev_state { > struct cxl_fw_state fw; > > struct rcuwait mbox_wait; > + struct notifier_block mce_notifier; > int (*mbox_send)(struct cxl_memdev_state *mds, > struct cxl_mbox_cmd *cmd); > }; > @@ -836,6 +837,8 @@ int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len, > int cxl_trigger_poison_list(struct cxl_memdev *cxlmd); > int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa); > int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa); > +bool cxl_mce_recorded(u64 pfn); > +void cxl_mce_clear(u64 pfn); > > #ifdef CONFIG_CXL_SUSPEND > void cxl_mem_active_inc(void);