On Wed, Apr 24, 2024 at 10:42:59AM +0530, Aravind Iddamsetty wrote: > > On 24/04/24 05:19, Michał Winiarski wrote: > > On Mon, Apr 22, 2024 at 12:27:56PM +0530, Aravind Iddamsetty wrote: > >> PCI subsystem provides callbacks to inform the driver about a request to > >> do function level reset by user, initiated by writing to sysfs entry > >> /sys/bus/pci/devices/.../reset. This will allow the driver to handle FLR > >> without the need to do unbind and rebind as the driver needs to > >> reinitialize the device afresh post FLR. > >> > >> v2: > >> 1. separate out gt idle and pci save/restore to a separate patch (Lucas) > >> 2. Fixed the warnings seen around xe_guc_submit_stop, xe_guc_puc_fini > >> > >> v3: declare xe_pci_err_handlers as static(Michal) > >> > >> Cc: Rodrigo Vivi <rodrigo.vivi@xxxxxxxxx> > >> Cc: Lucas De Marchi <lucas.demarchi@xxxxxxxxx> > >> Cc: Michal Wajdeczko <michal.wajdeczko@xxxxxxxxx> > >> > >> Reviewed-by: Rodrigo Vivi <rodrigo.vivi@xxxxxxxxx> > >> Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@xxxxxxxxxxxxxxx> > >> --- > >> drivers/gpu/drm/xe/Makefile | 1 + > >> drivers/gpu/drm/xe/xe_device_types.h | 3 + > >> drivers/gpu/drm/xe/xe_guc_pc.c | 4 ++ > >> drivers/gpu/drm/xe/xe_pci.c | 9 ++- > >> drivers/gpu/drm/xe/xe_pci.h | 2 + > >> drivers/gpu/drm/xe/xe_pci_err.c | 88 ++++++++++++++++++++++++++++ > >> drivers/gpu/drm/xe/xe_pci_err.h | 13 ++++ > >> 7 files changed, 119 insertions(+), 1 deletion(-) > >> create mode 100644 drivers/gpu/drm/xe/xe_pci_err.c > >> create mode 100644 drivers/gpu/drm/xe/xe_pci_err.h > >> > >> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile > >> index 8bc62bfbc679..693971a1fac0 100644 > >> --- a/drivers/gpu/drm/xe/Makefile > >> +++ b/drivers/gpu/drm/xe/Makefile > >> @@ -117,6 +117,7 @@ xe-y += xe_bb.o \ > >> xe_module.o \ > >> xe_pat.o \ > >> xe_pci.o \ > >> + xe_pci_err.o \ > >> xe_pcode.o \ > >> xe_pm.o \ > >> xe_preempt_fence.o \ > >> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h > >> index 0a66555229e9..8c749b378a92 100644 > >> --- a/drivers/gpu/drm/xe/xe_device_types.h > >> +++ b/drivers/gpu/drm/xe/xe_device_types.h > >> @@ -465,6 +465,9 @@ struct xe_device { > >> /** @pci_state: PCI state of device */ > >> struct pci_saved_state *pci_state; > >> > >> + /** @pci_device_is_reset: device went through PCIe FLR */ > >> + bool pci_device_is_reset; > >> + > >> /* private: */ > >> > >> #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) > >> diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c > >> index 509649d0e65e..efba0fbe2f5c 100644 > >> --- a/drivers/gpu/drm/xe/xe_guc_pc.c > >> +++ b/drivers/gpu/drm/xe/xe_guc_pc.c > >> @@ -902,6 +902,10 @@ static void xe_guc_pc_fini(struct drm_device *drm, void *arg) > >> return; > >> } > >> > >> + /* We already have done this before going through a reset, so skip here */ > >> + if (xe->pci_device_is_reset) > >> + return; > >> + > >> XE_WARN_ON(xe_force_wake_get(gt_to_fw(pc_to_gt(pc)), XE_FORCEWAKE_ALL)); > >> XE_WARN_ON(xe_guc_pc_gucrc_disable(pc)); > >> XE_WARN_ON(xe_guc_pc_stop(pc)); > >> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c > >> index a62300990e19..b5a582afc9e7 100644 > >> --- a/drivers/gpu/drm/xe/xe_pci.c > >> +++ b/drivers/gpu/drm/xe/xe_pci.c > >> @@ -23,6 +23,7 @@ > >> #include "xe_macros.h" > >> #include "xe_mmio.h" > >> #include "xe_module.h" > >> +#include "xe_pci_err.h" > >> #include "xe_pci_types.h" > >> #include "xe_pm.h" > >> #include "xe_sriov.h" > >> @@ -738,7 +739,7 @@ static void xe_pci_remove(struct pci_dev *pdev) > >> pci_set_drvdata(pdev, NULL); > >> } > >> > >> -static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > >> +int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > >> { > >> const struct xe_device_desc *desc = (const void *)ent->driver_data; > >> const struct xe_subplatform_desc *subplatform_desc; > >> @@ -986,6 +987,11 @@ static const struct dev_pm_ops xe_pm_ops = { > >> }; > >> #endif > >> > >> +static const struct pci_error_handlers xe_pci_err_handlers = { > >> + .reset_prepare = xe_pci_reset_prepare, > >> + .reset_done = xe_pci_reset_done, > >> +}; > >> + > >> static struct pci_driver xe_pci_driver = { > >> .name = DRIVER_NAME, > >> .id_table = pciidlist, > >> @@ -995,6 +1001,7 @@ static struct pci_driver xe_pci_driver = { > >> #ifdef CONFIG_PM_SLEEP > >> .driver.pm = &xe_pm_ops, > >> #endif > >> + .err_handler = &xe_pci_err_handlers, > >> }; > >> > >> int xe_register_pci_driver(void) > >> diff --git a/drivers/gpu/drm/xe/xe_pci.h b/drivers/gpu/drm/xe/xe_pci.h > >> index 73b90a430d1f..9faf5380a09e 100644 > >> --- a/drivers/gpu/drm/xe/xe_pci.h > >> +++ b/drivers/gpu/drm/xe/xe_pci.h > >> @@ -7,8 +7,10 @@ > >> #define _XE_PCI_H_ > >> > >> struct pci_dev; > >> +struct pci_device_id; > >> > >> int xe_register_pci_driver(void); > >> void xe_unregister_pci_driver(void); > >> void xe_load_pci_state(struct pci_dev *pdev); > >> +int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent); > >> #endif > >> diff --git a/drivers/gpu/drm/xe/xe_pci_err.c b/drivers/gpu/drm/xe/xe_pci_err.c > >> new file mode 100644 > >> index 000000000000..5306925ea2fa > >> --- /dev/null > >> +++ b/drivers/gpu/drm/xe/xe_pci_err.c > >> @@ -0,0 +1,88 @@ > >> +// SPDX-License-Identifier: MIT > >> +/* > >> + * Copyright © 2024 Intel Corporation > >> + */ > >> + > >> +#include <linux/pci.h> > >> +#include <drm/drm_drv.h> > >> + > >> +#include "xe_device.h" > >> +#include "xe_gt.h" > >> +#include "xe_gt_printk.h" > >> +#include "xe_pci.h" > >> +#include "xe_pci_err.h" > >> +#include "xe_pm.h" > >> +#include "xe_uc.h" > >> + > >> +/** > >> + * xe_pci_reset_prepare - Called when user issued a PCIe reset > >> + * via /sys/bus/pci/devices/.../reset. > >> + * @pdev: PCI device struct > >> + */ > >> +void xe_pci_reset_prepare(struct pci_dev *pdev) > >> +{ > >> + struct xe_device *xe = pci_get_drvdata(pdev); > >> + struct xe_gt *gt; > >> + int id, err; > >> + > >> + pci_warn(pdev, "preparing for PCIe reset\n"); > >> + > >> + drm_warn(&xe->drm, "removing device access to userspace\n"); > > Warn? Shouldn't it be info? > I believe warn is appropriate as this is not a usual path the users apps expect > to hit and as they loose device connection. It's an expected path after issuing a reset via /sys/bus/pci/devices/.../reset. DRM device disappearing - yeah, that's probably not expected, fully agree on that. > > > >> + drm_dev_unplug(&xe->drm); > >> + > >> + xe_pm_runtime_get(xe); > >> + /* idle the GTs */ > >> + for_each_gt(gt, xe, id) { > >> + err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); > >> + if (err) > >> + goto reset; > >> + xe_uc_reset_prepare(>->uc); > >> + xe_gt_idle(gt); > >> + err = xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL); > >> + XE_WARN_ON(err); > >> + } > >> + xe_pm_runtime_put(xe); > >> + > >> +reset: > >> + pci_disable_device(pdev); > >> +} > >> + > >> +/** > >> + * xe_pci_reset_done - Called when PCIe reset is done. > >> + * @pdev: PCI device struct > >> + */ > >> +void xe_pci_reset_done(struct pci_dev *pdev) > >> +{ > >> + const struct pci_device_id *ent = pci_match_id(pdev->driver->id_table, pdev); > >> + struct xe_device *xe = pci_get_drvdata(pdev); > >> + > >> + dev_info(&pdev->dev, > >> + "device went through PCIe reset, reenabling the device\n"); > >> + > >> + if (pci_enable_device(pdev)) { > >> + dev_err(&pdev->dev, > >> + "Cannot re-enable PCI device after reset\n"); > >> + return; > >> + } > >> + pci_set_master(pdev); > >> + xe_load_pci_state(pdev); > >> + > >> + xe->pci_device_is_reset = true; > >> + /* > >> + * We want to completely clean the driver and even destroy > >> + * the xe private data and reinitialize afresh similar to > >> + * probe > >> + */ > >> + pdev->driver->remove(pdev); > > Do we really want to do that? > > First of all, that fairly unusual - none of the other PCI drivers makes > > changes in the device/driver model during reset, which makes me wonder > > if this is really what the user expects. > > I would expect that the device is reset, but the driver is not unloaded > > and previously created FDs are still valid. > We cannot continue to work with previous instance of driver as post > reset GuC , LMEM and most of the soc is reset, so we need to reinitialize > device afresh like in driver probe, hence we remove to clean the previous > stale driver state and re-probe again. GuC, LMEM and most of the soc is reset when we're doing S3 / S4, and yet, we're not removing the driver in this scenario. > > All applications are expected to reopen the device handles afresh. Current UMDs are not handling that case. > > > > > Note that messing with the driver model at reset also makes it difficult > > to do a proper cleanup if SR-IOV is enabled, as PCI-core expects drivers > > to remove VF devices during driver->remove. > > Removal of said devices depends on pci_cfg_access_lock, which is already > > held for the duration of the reset (wh > I haven't verified SRIOV use case, i believe we can take this up as next step. > Also, since this is not an automatic reset but a user/admin initiated i believe > the onus can be on admin to close all VFs before initiating a reset. That's not the contract for reset. It's not a "user/admin" interface, there doesn't necessarily have to be an "interactive" user interfacing with it through shell. It's just a regular stable sysfs uAPI - it could very well be called by application as well. > > ich includes calling reset_done), > > which will cause a deadlock. > > > > In current form, it also causes WARN if there are open FDs to DRM > > device during reset. > I did verify using igt@device_reset@reset-bound which has a open connection haven't seen this error > but will re verify again but again certain warns are expected but driver reload should be successful. > similarly here as well we can expect the admin to close any applications using the device before > initiating a reset. Can we expect this? If we can - why are you stating that "all applications are expected to reopen the device handles afresh"? I believe that the expectation for reset is that it behaves simiarly to the PM flows, which is mostly transparent to the users. Thanks, -Michał > > Thanks, > > Aravind. > > > > [29357.277699] sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:02.0/tile0' 01:06:58 [142/47263] > > [29357.286158] CPU: 7 PID: 3479 Comm: bash Not tainted 6.9.0-rc5-xe+ #78 > > [29357.305870] Call Trace: > > [29357.308342] <TASK> > > [29357.310453] dump_stack_lvl+0x139/0x1d0 > > [29357.314305] ? __pfx_dump_stack_lvl+0x10/0x10 > > [29357.318680] ? __pfx__printk+0x10/0x10 > > [29357.322450] ? kmalloc_trace+0x1c1/0x3a0 > > [29357.326394] ? sysfs_create_dir_ns+0x162/0x210 > > [29357.330861] sysfs_create_dir_ns+0x195/0x210 > > [29357.335154] ? __pfx_sysfs_create_dir_ns+0x10/0x10 > > [29357.339965] ? strcmp+0x2f/0x60 > > [29357.343134] kobject_add_internal+0x2af/0x600 > > [29357.347517] kobject_add+0xfb/0x190 > > [29357.351028] ? __link_object+0x1c7/0x280 > > [29357.354976] ? __pfx_kobject_add+0x10/0x10 > > [29357.359093] ? __create_object+0x62/0x140 > > [29357.363111] ? rcu_is_watching+0x20/0x50 > > [29357.367055] ? kmalloc_trace+0x1c1/0x3a0 > > [29357.370998] ? xe_tile_sysfs_init+0x4b/0x100 [xe] > > [29357.376016] xe_tile_sysfs_init+0x91/0x100 [xe] > > [29357.380868] xe_tile_init_noalloc+0xaf/0xc0 [xe] > > [29357.385936] xe_device_probe+0x60c/0x750 [xe] > > [29357.390674] ? __asan_memcpy+0x40/0x70 > > [29357.394461] ? __drmm_add_action+0x1ac/0x210 [drm] > > [29357.399413] xe_pci_probe+0x5dc/0x680 [xe] > > [29357.403882] pci_reset_function+0x108/0x140 > > [29357.408100] reset_store+0xb5/0x120 > > [29357.411623] ? __pfx_reset_store+0x10/0x10 > > [29357.415751] ? __pfx_sysfs_kf_write+0x10/0x10 > > [29357.420149] kernfs_fop_write_iter+0x1b8/0x260 > > [29357.424620] vfs_write+0x5ce/0x660 > > [29357.428067] ? __pfx_vfs_write+0x10/0x10 > > [29357.432028] ? __rcu_read_unlock+0x60/0x90 > > [29357.436181] ksys_write+0xe4/0x190 > > [29357.439612] ? __pfx_ksys_write+0x10/0x10 > > [29357.443657] do_syscall_64+0x7b/0x120 > > [29357.447348] entry_SYSCALL_64_after_hwframe+0x76/0x7e > > [29357.452423] RIP: 0033:0x7f4f1ff14887 > > [29357.456030] Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 > > [29357.474761] RSP: 002b:00007ffe54e95068 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 > > [29357.482343] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f4f1ff14887 > > [29357.489487] RDX: 0000000000000002 RSI: 0000559eb4076e30 RDI: 0000000000000001 > > [29357.496630] RBP: 0000559eb4076e30 R08: 0000000000000000 R09: 0000559eb4076e30 > > [29357.503775] R10: 0000000000000077 R11: 0000000000000246 R12: 0000000000000002 > > [29357.510947] R13: 00007f4f2001b780 R14: 00007f4f20017600 R15: 00007f4f20016a00 > > [29357.518174] </TASK> > > [29357.520539] kobject: kobject_add_internal failed for tile0 with -EEXIST, don't try to register things with the same name in the same directory. > > > > -Michał > > > >> + if (pci_dev_msi_enabled(pdev)) > >> + pci_free_irq_vectors(pdev); > >> + > >> + devm_drm_dev_release_action(&xe->drm); > >> + pci_disable_device(pdev); > >> + > >> + /* > >> + * if this fails the driver might be in a stale state, only option is > >> + * to unbind and rebind > >> + */ > >> + xe_pci_probe(pdev, ent); > >> +} > >> diff --git a/drivers/gpu/drm/xe/xe_pci_err.h b/drivers/gpu/drm/xe/xe_pci_err.h > >> new file mode 100644 > >> index 000000000000..95a4c8ce9cf1 > >> --- /dev/null > >> +++ b/drivers/gpu/drm/xe/xe_pci_err.h > >> @@ -0,0 +1,13 @@ > >> +/* SPDX-License-Identifier: MIT */ > >> +/* > >> + * Copyright © 2024 Intel Corporation > >> + */ > >> + > >> +#ifndef _XE_PCI_ERR_H_ > >> +#define _XE_PCI_ERR_H_ > >> + > >> +struct pci_dev; > >> + > >> +void xe_pci_reset_prepare(struct pci_dev *pdev); > >> +void xe_pci_reset_done(struct pci_dev *pdev); > >> +#endif > >> -- > >> 2.25.1 > >>