Re: [PATCH v3 7/7] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 11/26/2024 8:05 AM, Jonathan Cameron wrote:
On Tue, 19 Nov 2024 00:39:15 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx> wrote:

When PCIe AER is in FW-First, OS should process CXL Protocol errors from
CPER records. Introduce support for handling and logging CXL Protocol
errors.

The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error trace native CXL AER endpoint errors, while
cxl_cper_trace_corr_prot_err and cxl_cper_trace_uncorr_prot_err
trace native CXL AER port errors. Reuse both sets to trace FW-First
protocol errors.

Since the CXL code is required to be called from process context and
GHES is in interrupt context, use workqueues for processing.

Similar to CXL CPER event handling, use kfifo to handle errors as it
simplifies queue processing by providing lock free fifo operations.

Add the ability for the CXL sub-system to register a workqueue to
process CXL CPER protocol errors.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@xxxxxxx>

A few minor comments inline.

Thanks

Jonathan

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 4ede038a7148..c992b34c290b 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -650,6 +650,56 @@ void read_cdat_data(struct cxl_port *port)
  }
  EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
+void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
+				  struct cxl_ras_capability_regs ras_cap)
+{
+	struct cxl_dev_state *cxlds;
+	u32 status;
+
+	status = ras_cap.cor_status & ~ras_cap.cor_mask;
+
+	if (!flag) {

As below. Name of flag is not very helpful when reading the code.
Perhaps we can rename?

Okay. May be flag -> is_device_error ?

+		trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+		return;
+	}
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_prot_err, CXL);
+
+void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
+				    struct cxl_ras_capability_regs ras_cap)
+{
+	struct cxl_dev_state *cxlds;
+	u32 status, fe;
+
+	status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+
+	if (hweight32(status) > 1)
+		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+				   ras_cap.cap_control));
+	else
+		fe = status;
+
+	if (!flag) {

Why does  a bool named flag indicate it's a port error?

I will rename it.

Or may be use an enum to explicitly define the error type (CXL_ERROR_TYPE_DEVICE and CXL_ERROR_TYPE_PORT).

Or may be split the function into two distinct ones, one for port errors and one for device errors.

Let me know your preference or any other suggestions here. I will change it accordingly.


+		trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
+						       ras_cap.header_log);
+		return;
+	}
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+					  ras_cap.header_log);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_uncorr_prot_err, CXL);
+
  static void __cxl_handle_cor_ras(struct device *dev,
  				 void __iomem *ras_base)
  {
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 4da07727ab9c..5e4aa8681937 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -129,4 +129,10 @@ void read_cdat_data(struct cxl_port *port);
  void cxl_cor_error_detected(struct pci_dev *pdev);
  pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
  				    pci_channel_state_t state);
+
+struct cxl_ras_capability_regs;
+void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
+				  struct cxl_ras_capability_regs ras_cap);
+void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
+				    struct cxl_ras_capability_regs ras_cap);
  #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 88a14d7baa65..e261abe60e90 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1067,6 +1067,53 @@ static void cxl_cper_work_fn(struct work_struct *work)
  }
  static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
+static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
+{
+	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
+				       data->prot_err.agent_addr.function);
+	struct pci_dev *pdev __free(pci_dev_put) =
+		pci_get_domain_bus_and_slot(
+			data->prot_err.agent_addr.segment,
+			data->prot_err.agent_addr.bus,
+			devfn
+		);
		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
					    data->prot_err.agent_addr.bus,
					    devfn);

Noted.


+	int port_type;
+
+	if (!pdev)
+		return;
+
+	guard(device)(&pdev->dev);
+	if (pdev->driver != &cxl_pci_driver)
+		return;
+
+	port_type = pci_pcie_type(pdev);
+	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
+	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
+	    port_type == PCI_EXP_TYPE_UPSTREAM) {
+		if (data->severity == AER_CORRECTABLE)
+			cxl_cper_trace_corr_prot_err(pdev, false, data->ras_cap);
+		else
+			cxl_cper_trace_uncorr_prot_err(pdev, false, data->ras_cap);
+
+		return;
+	}
+
+	if (data->severity == AER_CORRECTABLE)
+		cxl_cper_trace_corr_prot_err(pdev, true, data->ras_cap);
+	else
+		cxl_cper_trace_uncorr_prot_err(pdev, true, data->ras_cap);
+
+}

  static int __init cxl_pci_driver_init(void)
  {
  	int rc;
@@ -1079,13 +1126,21 @@ static int __init cxl_pci_driver_init(void)
  	if (rc)
  		pci_unregister_driver(&cxl_pci_driver);
+ rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+	if (rc) {
+		cxl_cper_unregister_event_work(&cxl_cper_work);
+		pci_unregister_driver(&cxl_pci_driver);
I'd switch this to a goto style for error handling.


+	}
+
  	return rc;

that is
	return 0;

err_unregister_event_work:
	cxl_cper_unregister_event_work(&cxl_cper_work);
err_unreg:
	pci_unregister_driver(&cxl_pci_driver);
	return rc;
  }

Noted.

Thanks
Smita





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux