Dave Jiang wrote: > SBR is equivalent to a device been hot removed and inserted again. Doing a > SBR on a CXL type 3 device is problematic if the exported device memory is > part of system memory that cannot be offlined. The event is equivalent to > violently ripping out that range of memory from the kernel. While the > hardware requires the "Unmask SBR" bit set in the Port Control Extensions > register and the kernel currently does not unmask it, user can unmask > this bit via setpci or similar tool. > > The driver does not have a way to detect whether a reset coming from the > PCI subsystem is a Function Level Reset (FLR) or SBR. The only way to > detect is to note if a decoder is marked as enabled in software but the > decoder control register indicates it's not committed. > > A helper function is added to find discrepancy between the decoder > software state versus the hardware register state. > > Suggested-by: Dan Williams <dan.j.williams@xxxxxxxxx> > Signed-off-by: Dave Jiang <dave.jiang@xxxxxxxxx> > --- > v2: > - Fix typo in commit log > --- > drivers/cxl/core/port.c | 30 ++++++++++++++++++++++++++++++ > drivers/cxl/cxl.h | 2 ++ > drivers/cxl/pci.c | 19 +++++++++++++++++++ > 3 files changed, 51 insertions(+) > > diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c > index 2b0cab556072..e33d047f992b 100644 > --- a/drivers/cxl/core/port.c > +++ b/drivers/cxl/core/port.c > @@ -2222,6 +2222,36 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, > } > EXPORT_SYMBOL_NS_GPL(cxl_endpoint_get_perf_coordinates, CXL); > > +static int decoder_hw_mismatch(struct device *dev, void *data) I would just call this __cxl_endpoint_decoder_reset_detected(), which is clearer. > +{ > + struct cxl_endpoint_decoder *cxled; > + struct cxl_port *port = data; > + struct cxl_decoder *cxld; > + struct cxl_hdm *cxlhdm; > + void __iomem *hdm; > + u32 ctrl; > + > + if (!is_endpoint_decoder(dev)) > + return 0; > + > + cxled = to_cxl_endpoint_decoder(dev); > + if ((cxled->cxld.flags & CXL_DECODER_F_ENABLE) == 0) > + return 0; > + > + cxld = &cxled->cxld; > + cxlhdm = dev_get_drvdata(&port->dev); > + hdm = cxlhdm->regs.hdm_decoder; > + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id)); There is no register access in cxl/core/port.c that is all handled in drivers/cxl/core/pci.c. That helps with cxl_test that knows it needs to intercept drivers/cxl/core/pci.c exports. > + > + return !FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl); > +} > + > +bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port) > +{ > + return device_for_each_child(&port->dev, port, decoder_hw_mismatch); > +} > +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, CXL); > + > /* for user tooling to ensure port disable work has completed */ > static ssize_t flush_store(const struct bus_type *bus, const char *buf, size_t count) > { > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h > index 534e25e2f0a4..e3c237c50b59 100644 > --- a/drivers/cxl/cxl.h > +++ b/drivers/cxl/cxl.h > @@ -895,6 +895,8 @@ void cxl_coordinates_combine(struct access_coordinate *out, > struct access_coordinate *c1, > struct access_coordinate *c2); > > +bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port); > + > /* > * Unit test builds overrides this to __weak, find the 'strong' version > * of these symbols in tools/testing/cxl/. > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c > index 13450e75f5eb..78457542aeec 100644 > --- a/drivers/cxl/pci.c > +++ b/drivers/cxl/pci.c > @@ -957,11 +957,30 @@ static void cxl_error_resume(struct pci_dev *pdev) > dev->driver ? "successful" : "failed"); > } > > +static void cxl_reset_done(struct pci_dev *pdev) > +{ > + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); > + struct cxl_memdev *cxlmd = cxlds->cxlmd; > + struct device *dev = &pdev->dev; > + > + /* > + * FLR does not expect to touch the HDM decoders and related registers. > + * SBR however will wipe all device configurations. > + * Issue warning if there was active decoder before reset that no > + * longer exists. > + */ > + if (cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) { > + dev_warn(dev, "SBR happened without memory regions removal.\n"); > + dev_warn(dev, "System may be unstable if regions hosted system memory.\n"); This should taint the kernel so that any future crash shows that someone had force reset an active CXL memory configuration. > + } > +} > + > static const struct pci_error_handlers cxl_error_handlers = { > .error_detected = cxl_error_detected, > .slot_reset = cxl_slot_reset, > .resume = cxl_error_resume, > .cor_error_detected = cxl_cor_error_detected, > + .reset_done = cxl_reset_done, > }; > > static struct pci_driver cxl_pci_driver = { > -- > 2.44.0 >