On 06/08/24 10:02, Raag Jadav wrote: This change was originally sent by Himal, so maybe you should keep his authorship. > From: Lucas De Marchi <lucas.demarchi@xxxxxxxxx> > > Bring back uevent for gt reset failure with better uapi naming. > With this in place we can receive failure event using udev. > > $ udevadm monitor --property --kernel > monitor will print the received events for: > KERNEL - the kernel uevent > > KERNEL[871.188570] change /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 (pci) > ACTION=change > DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 > SUBSYSTEM=pci > DEVICE_STATUS=NEEDS_RESET > REASON=GT_RESET_FAILED > TILE_ID=0 > GT_ID=0 > DRIVER=xe > PCI_CLASS=30000 > PCI_ID=8086:56B1 > PCI_SUBSYS_ID=8086:1210 > PCI_SLOT_NAME=0000:03:00.0 > MODALIAS=pci:v00008086d000056B1sv00008086sd00001210bc03sc00i00 > SEQNUM=6104 > > Signed-off-by: Lucas De Marchi <lucas.demarchi@xxxxxxxxx> > Signed-off-by: Raag Jadav <raag.jadav@xxxxxxxxx> > --- > drivers/gpu/drm/xe/xe_gt.c | 27 +++++++++++++++++++++++++-- > include/uapi/drm/xe_drm.h | 17 +++++++++++++++++ > 2 files changed, 42 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c > index b04e47186f5b..5ceef0059861 100644 > --- a/drivers/gpu/drm/xe/xe_gt.c > +++ b/drivers/gpu/drm/xe/xe_gt.c > @@ -740,6 +740,30 @@ static int do_gt_restart(struct xe_gt *gt) > return 0; > } > > +static void xe_uevent_gt_reset_failure(struct pci_dev *pdev, u8 tile_id, u8 gt_id) > +{ > + char *reset_event[5]; > + > + reset_event[0] = DRM_XE_RESET_REQUIRED_UEVENT; > + reset_event[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT; > + reset_event[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", tile_id); > + reset_event[3] = kasprintf(GFP_KERNEL, "GT_ID=%d", gt_id); > + reset_event[4] = NULL; > + kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, reset_event); > + > + kfree(reset_event[2]); > + kfree(reset_event[3]); > +} > + > +static void 
gt_reset_failed(struct xe_gt *gt, int err) > +{ > + xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); > + > + /* Notify userspace about gt reset failure */ > + xe_uevent_gt_reset_failure(to_pci_dev(gt_to_xe(gt)->drm.dev), > + gt_to_tile(gt)->id, gt->info.id); > +} > + > static int gt_reset(struct xe_gt *gt) > { > int err; > @@ -795,8 +819,7 @@ static int gt_reset(struct xe_gt *gt) > XE_WARN_ON(xe_uc_start(&gt->uc)); > xe_pm_runtime_put(gt_to_xe(gt)); > err_fail: > - xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); > - > + gt_reset_failed(gt, err); > xe_device_declare_wedged(gt_to_xe(gt)); Also, we might want to have a RESET_REQUIRED event whenever the device is wedged. Thanks, Aravind. > > return err; > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h > index 19619d4952a8..9ea3be97535e 100644 > --- a/include/uapi/drm/xe_drm.h > +++ b/include/uapi/drm/xe_drm.h > @@ -20,6 +20,7 @@ extern "C" { > * 2. Extension definition and helper structs > * 3. IOCTL's Query structs in the order of the Query's entries. > * 4. The rest of IOCTL structs in the order of IOCTL declaration. > + * 5. uEvents > */ > > /** > @@ -1686,6 +1687,22 @@ struct drm_xe_oa_stream_info { > __u64 reserved[3]; > }; > > +/** > + * DOC: uevent generated by xe on it's pci node. > + * > + * DRM_XE_RESET_REQUIRED_UEVENT - Event is generated when device needs reset. > + * The REASON is provided along with the event for which reset is required. > + * On the basis of REASONS, additional information might be supplied. > + */ > +#define DRM_XE_RESET_REQUIRED_UEVENT "DEVICE_STATUS=NEEDS_RESET" > + > +/** > + * DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT - Reason provided to DRM_XE_RESET_REQUIRED_UEVENT > + * incase of gt reset failure. The additional information supplied is tile id and > + * gt id of the gt unit for which reset has failed. > + */ > +#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT "REASON=GT_RESET_FAILED" > + > #if defined(__cplusplus) > } > #endif