Hi Jonathan, On 10/30/2024 11:07 AM, Jonathan Cameron wrote: > On Fri, 25 Oct 2024 16:03:04 -0500 > Terry Bowman <terry.bowman@xxxxxxx> wrote: > >> The CXL drivers use kernel trace functions for logging endpoint and >> RCH downstream port RAS errors. Similar functionality is >> required for CXL root ports, CXL downstream switch ports, and CXL >> upstream switch ports. >> >> Introduce trace logging functions for both RAS correctable and >> uncorrectable errors specific to CXL PCIe ports. Additionally, update >> the PCIe port error handlers to invoke these new trace functions. >> >> Signed-off-by: Terry Bowman <terry.bowman@xxxxxxx> > +CC Mauro and Shiju to give the tracepoint a sanity check and for > awareness that we have something new to feed rasdaemon :) > > Jonathan > >> --- >> drivers/cxl/core/pci.c | 16 ++++++++++---- >> drivers/cxl/core/trace.h | 47 ++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 59 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c >> index adb184d346ae..eeb4a64ba5b5 100644 >> --- a/drivers/cxl/core/pci.c >> +++ b/drivers/cxl/core/pci.c >> @@ -661,10 +661,14 @@ static void __cxl_handle_cor_ras(struct device *dev, >> >> addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; >> status = readl(addr); >> - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { >> - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); >> + if (!(status & CXL_RAS_CORRECTABLE_STATUS_MASK)) >> + return; >> + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); >> + >> + if (is_cxl_memdev(dev)) >> trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); >> - } >> + else if (dev_is_pci(dev)) > How would you get here otherwise? Is it useful to know it is a pci device > here? This dev_is_pci() check is not necessary and can be removed. 
>> + trace_cxl_port_aer_correctable_error(dev, status); >> } >> >> static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) >> @@ -720,7 +724,11 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base) >> } >> >> header_log_copy(ras_base, hl); >> - trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); >> + if (is_cxl_memdev(dev)) >> + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); >> + else if (dev_is_pci(dev)) > as above. Got it and thank you. >> + trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl); >> + >> writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); >> >> return true; >> diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h >> index 8672b42ee4d1..1c4368a7b50b 100644 >> --- a/drivers/cxl/core/trace.h >> +++ b/drivers/cxl/core/trace.h >> @@ -48,6 +48,34 @@ >> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ >> ) >> >> +TRACE_EVENT(cxl_port_aer_uncorrectable_error, >> + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), >> + TP_ARGS(dev, status, fe, hl), >> + TP_STRUCT__entry( >> + __string(devname, dev_name(dev)) >> + __string(host, dev_name(dev->parent)) >> + __field(u32, status) >> + __field(u32, first_error) >> + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) >> + ), >> + TP_fast_assign( >> + __assign_str(devname); >> + __assign_str(host); >> + __entry->status = status; >> + __entry->first_error = fe; >> + /* >> + * Embed the 512B headerlog data for user app retrieval and >> + * parsing, but no need to print this in the trace buffer. > I'm not sure any printing as such goes on in the trace buffer. It is from > the data in the trace buffer I think. Right, the comment indicates that the header log is not printed but is included here because the trace buffer can be accessed by applications. 
Regards, Terry >> + */ >> + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); >> + ), >> + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", >> + __get_str(devname), __get_str(host), >> + show_uc_errs(__entry->status), >> + show_uc_errs(__entry->first_error) >> + ) >> +); >> + >> TRACE_EVENT(cxl_aer_uncorrectable_error, >> TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), >> TP_ARGS(cxlmd, status, fe, hl), >> @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, >> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ >> ) >> >> +TRACE_EVENT(cxl_port_aer_correctable_error, >> + TP_PROTO(struct device *dev, u32 status), >> + TP_ARGS(dev, status), >> + TP_STRUCT__entry( >> + __string(devname, dev_name(dev)) >> + __string(host, dev_name(dev->parent)) >> + __field(u32, status) >> + ), >> + TP_fast_assign( >> + __assign_str(devname); >> + __assign_str(host); >> + __entry->status = status; >> + ), >> + TP_printk("device=%s host=%s status='%s'", >> + __get_str(devname), __get_str(host), >> + show_ce_errs(__entry->status) >> + ) >> +); >> + >> TRACE_EVENT(cxl_aer_correctable_error, >> TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), >> TP_ARGS(cxlmd, status),