Re: [PATCH 3/4] acpi/ghes, cxl/pci: Trace FW-First CXL Protocol Errors

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Dan,

On 6/11/2024 5:07 PM, Dan Williams wrote:
Smita Koralahalli wrote:
When PCIe AER is in FW-First, OS should process CXL Protocol errors from
CPER records.

Reuse the existing work queue cxl_cper_work registered with GHES to notify
the CXL subsystem on a Protocol error.

The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error currently trace native CXL AER errors. Reuse
them to trace FW-First Protocol Errors.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx>
---
  drivers/acpi/apei/ghes.c  | 14 ++++++++++++++
  drivers/cxl/core/pci.c    | 24 ++++++++++++++++++++++++
  drivers/cxl/cxlpci.h      |  3 +++
  drivers/cxl/pci.c         | 34 ++++++++++++++++++++++++++++++++--
  include/linux/cxl-event.h |  1 +
  5 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 1a58032770ee..a31bd91e9475 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err))
  		return;
+
+	guard(spinlock_irqsave)(&cxl_cper_work_lock);
+
+	if (!cxl_cper_work)
+		return;
+
+	wd.event_type = CXL_CPER_EVENT_PROT_ERR;
+
+	if (!kfifo_put(&cxl_cper_fifo, wd)) {
+		pr_err_ratelimited("CXL CPER kfifo overflow\n");
+		return;
+	}
+
+	schedule_work(cxl_cper_work);

This seems wrong to unconditionally schedule the cxl_pci driver to look
at potentially "non-device" errors. With Terry's upcoming CXL switch
port error handling there will be a native path for those errors, but
until that arrives, I see no point in this code trying to convey
root/switch port errors to the endpoint driver.

I see okay. What are your recommendations on this? Just confine it to CXL RCD, CXL SLD and CXL LD? And then extend it to ports once Terry sends patches?

Also, I'm not sure about FMLD. Should we just drop it as of now?


  }
int cxl_cper_register_work(struct work_struct *work)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 0df09bd79408..ef9438cb1dd6 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port)
  }
  EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
+void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
+			struct cxl_cper_prot_err *p_err)
+{
+	u32 status, fe;
+
+	if (p_err->severity == CXL_AER_CORRECTABLE) {
+		status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask;
+
+		trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+	} else {
+		status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask;
+
+		if (hweight32(status) > 1)
+			fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+					   p_err->cxl_ras.cap_control));
+		else
+			fe = status;
+
+		trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+						  p_err->cxl_ras.header_log);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL);
+
  static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
  				 void __iomem *ras_base)
  {
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 93992a1c8eec..0ba3215786e1 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port);
  void cxl_cor_error_detected(struct pci_dev *pdev);
  pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
  				    pci_channel_state_t state);
+struct cxl_cper_prot_err;
+void cxl_trace_prot_err(struct cxl_dev_state *cxlds,
+			struct cxl_cper_prot_err *p_err);
  #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 74876c9835e8..3e3c36983686 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type,
  			       &uuid_null, &rec->event);
  }
+static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err)

Can we call this variable @rec instead of @p_err? The data being passed
is CPER data which is a "record" structure.

Will change.


+{
+	struct pci_dev *pdev __free(pci_dev_put) = NULL;
+	struct cxl_dev_state *cxlds;
+	unsigned int devfn;
+
+	devfn = PCI_DEVFN(p_err->device, p_err->function);
+	pdev = pci_get_domain_bus_and_slot(p_err->segment,
+					   p_err->bus, devfn);
+	if (!pdev)
+		return;
+
+	guard(device)(&pdev->dev);
+	if (pdev->driver != &cxl_pci_driver)
+		return;
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial)
+		pr_warn("CPER-reported device serial number does not match expected value\n");

Not much the end user can do about this warning, I would say skip this
message, or make it a pci_dbg() because matching by BDF should be
sufficient.

Will skip this message.

Thanks
Smita

+
+	cxl_trace_prot_err(cxlds, p_err);
+}
+
  static void cxl_cper_work_fn(struct work_struct *work)
  {
  	struct cxl_cper_work_data wd;
- while (cxl_cper_kfifo_get(&wd))
-		cxl_handle_cper_event(wd.event_type, &wd.rec);
+	while (cxl_cper_kfifo_get(&wd)) {
+		if (wd.event_type == CXL_CPER_EVENT_PROT_ERR)
+			cxl_handle_prot_err(&wd.p_err);
+		else
+			cxl_handle_cper_event(wd.event_type, &wd.rec);
+	}
  }
  static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 9c7b69e076a0..5562844df850 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -122,6 +122,7 @@ struct cxl_event_record_raw {
  } __packed;
enum cxl_event_type {
+	CXL_CPER_EVENT_PROT_ERR,
  	CXL_CPER_EVENT_GENERIC,
  	CXL_CPER_EVENT_GEN_MEDIA,
  	CXL_CPER_EVENT_DRAM,
--
2.17.1







[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux