On Tue, Aug 13, 2024 at 04:28:32PM +0300, Raag Jadav wrote: > On Mon, Aug 12, 2024 at 03:08:14PM +0530, Aravind Iddamsetty wrote: > > > > On 12/08/24 13:18, Raag Jadav wrote: > > > From: Himal Prasad Ghimiray <himal.prasad.ghimiray@xxxxxxxxx> > > > > > > This was dropped in commit 77a0d4d1cea2 ("drm/xe/uapi: Remove reset > > > uevent for now") as part of refactoring. > > > > > > Now that we have better uapi semantics and naming for the uevent, > > > bring it back. With this in place, userspace will be notified of > > > wedged device along with its reason. > > > > > > $ udevadm monitor --property --kernel > > > monitor will print the received events for: > > > KERNEL - the kernel uevent > > > > > > KERNEL[871.188570] change /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 (pci) > > > ACTION=change > > > DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 > > > SUBSYSTEM=pci > > > DEVICE_STATUS=NEEDS_RESET > > > REASON=GT_RESET_FAILED > > > TILE_ID=0 > > > GT_ID=0 > > > DRIVER=xe > > > PCI_CLASS=30000 > > > PCI_ID=8086:56B1 > > > PCI_SUBSYS_ID=8086:1210 > > > PCI_SLOT_NAME=0000:03:00.0 > > > MODALIAS=pci:v00008086d000056B1sv00008086sd00001210bc03sc00i00 > > > SEQNUM=6104 > > > > > > v2: Change authorship to Himal (Aravind) > > > Add uevent for all device wedged cases (Aravind) > > > > > > Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@xxxxxxxxx> > > > Signed-off-by: Raag Jadav <raag.jadav@xxxxxxxxx> > > > --- > > > drivers/gpu/drm/xe/xe_device.c | 10 +++++++++- > > > drivers/gpu/drm/xe/xe_device.h | 2 +- > > > drivers/gpu/drm/xe/xe_gt.c | 23 +++++++++++++++++++---- > > > drivers/gpu/drm/xe/xe_guc.c | 13 ++++++++++++- > > > drivers/gpu/drm/xe/xe_guc_submit.c | 13 ++++++++++++- > > > include/uapi/drm/xe_drm.h | 29 +++++++++++++++++++++++++++++ > > > 6 files changed, 82 insertions(+), 8 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c > > > index 1aba6f9eaa19..d975bdce4a7d 100644 > > > --- a/drivers/gpu/drm/xe/xe_device.c > > > +++ b/drivers/gpu/drm/xe/xe_device.c > > > @@ -955,6 +955,7 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg) > > > /** > > > * xe_device_declare_wedged - Declare device wedged > > > * @xe: xe device instance > > > + * @event_params: parameters to be sent along with uevent > > > * > > > * This is a final state that can only be cleared with a mudule > > > * re-probe (unbind + bind). > > > @@ -965,8 +966,10 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg) > > > * on every single execution timeout (a.k.a. GPU hang) right after devcoredump > > > * snapshot capture. In this mode, GT reset won't be attempted so the state of > > > * the issue is preserved for further debugging. > > > + * Caller is expected to pass respective parameters to be sent along with > > > + * uevent. Pass NULL in case of no params. > > > */ > > > -void xe_device_declare_wedged(struct xe_device *xe) > > > +void xe_device_declare_wedged(struct xe_device *xe, char **event_params) > > > { > > > struct xe_gt *gt; > > > u8 id; > > > @@ -984,12 +987,17 @@ void xe_device_declare_wedged(struct xe_device *xe) > > > xe_pm_runtime_get_noresume(xe); > > > > > > if (!atomic_xchg(&xe->wedged.flag, 1)) { > > > + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); > > > + > > > xe->needs_flr_on_fini = true; > > > drm_err(&xe->drm, > > > "CRITICAL: Xe has declared device %s as wedged.\n" > > > "IOCTLs and executions are blocked. Only a rebind may clear the failure\n" > > > "Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n", > > > dev_name(xe->drm.dev)); > > > + > > > + /* Notify userspace about reset required */ > > > + kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, event_params); > > > } > > > > > > for_each_gt(gt, xe, id) > > > diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h > > > index db6cc8d0d6b8..5d40fc6f0904 100644 > > > --- a/drivers/gpu/drm/xe/xe_device.h > > > +++ b/drivers/gpu/drm/xe/xe_device.h > > > @@ -174,7 +174,7 @@ static inline bool xe_device_wedged(struct xe_device *xe) > > > return atomic_read(&xe->wedged.flag); > > > } > > > > > > -void xe_device_declare_wedged(struct xe_device *xe); > > > +void xe_device_declare_wedged(struct xe_device *xe, char **reset_event); > > > > > > struct xe_file *xe_file_get(struct xe_file *xef); > > > void xe_file_put(struct xe_file *xef); > > > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c > > > index 58895ed22f6e..519f3c2cf9e2 100644 > > > --- a/drivers/gpu/drm/xe/xe_gt.c > > > +++ b/drivers/gpu/drm/xe/xe_gt.c > > > @@ -741,6 +741,24 @@ static int do_gt_restart(struct xe_gt *gt) > > > return 0; > > > } > > > > > > +static void xe_gt_reset_failed(struct xe_gt *gt, int err) > > > +{ > > > + char *event_params[5]; > > > + > > > + xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); > > > + > > > + event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT; > > > + event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT; > > > + event_params[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", gt_to_tile(gt)->id); > > > + event_params[3] = kasprintf(GFP_KERNEL, "GT_ID=%d", gt->info.id); > > > > the TILE_ID and GT_ID can be passed for other events as well, with that you can > > have a common function to send uevent which would take reason as an input. > > But is that required for all cases? There could be potential cases atleast > in the future where it is not needed. > > > > + event_params[4] = NULL; > > > + > > > + xe_device_declare_wedged(gt_to_xe(gt), event_params); > > > + > > > + kfree(event_params[2]); > > > + kfree(event_params[3]); > > > +} > > > + > > > static int gt_reset(struct xe_gt *gt) > > > { > > > int err; > > > @@ -796,10 +814,7 @@ static int gt_reset(struct xe_gt *gt) > > > XE_WARN_ON(xe_uc_start(>->uc)); > > > xe_pm_runtime_put(gt_to_xe(gt)); > > > err_fail: > > > - xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); > > > - > > > - xe_device_declare_wedged(gt_to_xe(gt)); > > > - > > > + xe_gt_reset_failed(gt, err); > > > return err; > > > } > > > > > > diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c > > > index de0fe9e65746..b544012f5b11 100644 > > > --- a/drivers/gpu/drm/xe/xe_guc.c > > > +++ b/drivers/gpu/drm/xe/xe_guc.c > > > @@ -560,6 +560,17 @@ static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc) > > > return ret ? ret : freq; > > > } > > > > > > +static void xe_guc_load_failed(struct xe_gt *gt) > > > +{ > > > + char *event_params[3]; > > > + > > > + event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT; > > > + event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GUC; > > > + event_params[2] = NULL; > > > + > > > + xe_device_declare_wedged(gt_to_xe(gt), event_params); > > > +} > > > + > > > /* > > > * Wait for the GuC to start up. > > > * > > > @@ -684,7 +695,7 @@ static void guc_wait_ucode(struct xe_guc *guc) > > > break; > > > } > > > > > > - xe_device_declare_wedged(gt_to_xe(gt)); > > > + xe_guc_load_failed(gt); > > > } else if (delta_ms > GUC_LOAD_TIME_WARN_MS) { > > > xe_gt_warn(gt, "excessive init time: %lldms! [status = 0x%08X, timeouts = %d]\n", > > > delta_ms, status, count); > > > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c > > > index 460808507947..33ed6221f465 100644 > > > --- a/drivers/gpu/drm/xe/xe_guc_submit.c > > > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c > > > @@ -891,6 +891,17 @@ void xe_guc_submit_wedge(struct xe_guc *guc) > > > mutex_unlock(&guc->submission_state.lock); > > > } > > > > > > +static void xe_exec_queue_timedout(struct xe_device *xe) > > > +{ > > > + char *event_params[3]; > > > + > > > + event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT; > > > + event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_TOUT; > > > + event_params[2] = NULL; > > > + > > > + xe_device_declare_wedged(xe, event_params); > > > +} > > > + > > > static bool guc_submit_hint_wedged(struct xe_guc *guc) > > > { > > > struct xe_device *xe = guc_to_xe(guc); > > > @@ -901,7 +912,7 @@ static bool guc_submit_hint_wedged(struct xe_guc *guc) > > > if (xe_device_wedged(xe)) > > > return true; > > > > > > > i see that this is even called from xe_guc_exec_queue_lr_cleanup which is for long running queue > > from various call paths need to check in what scenarios those happen. > > Should we add a reason for long running queue? Both of your questions would probably be answered if this was getting developed at the same time of the user space usage of these uevents. > > Raag