* re-probe (unbind + bind).
@@ -965,8 +966,10 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg)
* on every single execution timeout (a.k.a. GPU hang) right after devcoredump
* snapshot capture. In this mode, GT reset won't be attempted so the state of
* the issue is preserved for further debugging.
+ * Caller passes a NULL-terminated array of "KEY=value" strings to be sent
+ * along with the uevent, or NULL if there are no parameters.
*/
-void xe_device_declare_wedged(struct xe_device *xe)
+void xe_device_declare_wedged(struct xe_device *xe, char **event_params)
{
struct xe_gt *gt;
u8 id;
@@ -984,12 +987,17 @@ void xe_device_declare_wedged(struct xe_device *xe)
xe_pm_runtime_get_noresume(xe);
if (!atomic_xchg(&xe->wedged.flag, 1)) {
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+
xe->needs_flr_on_fini = true;
drm_err(&xe->drm,
"CRITICAL: Xe has declared device %s as wedged.\n"
"IOCTLs and executions are blocked. Only a rebind may clear the failure\n"
"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
dev_name(xe->drm.dev));
+
+ /* Notify userspace about reset required */
+ kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, event_params);
}
for_each_gt(gt, xe, id)
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index db6cc8d0d6b8..5d40fc6f0904 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -174,7 +174,7 @@ static inline bool xe_device_wedged(struct xe_device *xe)
return atomic_read(&xe->wedged.flag);
}
-void xe_device_declare_wedged(struct xe_device *xe);
+void xe_device_declare_wedged(struct xe_device *xe, char **event_params);
struct xe_file *xe_file_get(struct xe_file *xef);
void xe_file_put(struct xe_file *xef);
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 58895ed22f6e..519f3c2cf9e2 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -741,6 +741,24 @@ static int do_gt_restart(struct xe_gt *gt)
return 0;
}
+/* Log a failed GT reset, then wedge the device with tile/GT ids as uevent parameters. */
+static void xe_gt_reset_failed(struct xe_gt *gt, int err)
+{
+	char tile_id[32], gt_id[32];	/* on stack, so building the strings cannot fail */
+	char *event_params[] = {
+		DRM_XE_RESET_REQUIRED_UEVENT,
+		DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT,
+		tile_id,
+		gt_id,
+		NULL,
+	};
+
+	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
+	snprintf(tile_id, sizeof(tile_id), "TILE_ID=%d", gt_to_tile(gt)->id);
+	snprintf(gt_id, sizeof(gt_id), "GT_ID=%d", gt->info.id);
+	xe_device_declare_wedged(gt_to_xe(gt), event_params);
+}
+
static int gt_reset(struct xe_gt *gt)
{
int err;
@@ -796,10 +814,7 @@ static int gt_reset(struct xe_gt *gt)
XE_WARN_ON(xe_uc_start(&gt->uc));
xe_pm_runtime_put(gt_to_xe(gt));
err_fail:
- xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
-
- xe_device_declare_wedged(gt_to_xe(gt));
-
+ xe_gt_reset_failed(gt, err);
return err;
}
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index de0fe9e65746..b544012f5b11 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -560,6 +560,17 @@ static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc)
return ret ? ret : freq;
}
+/* Wedge the device, passing the GuC load failure reason as uevent parameters. */
+static void xe_guc_load_failed(struct xe_gt *gt)
+{
+	char *uevent_env[] = {
+		DRM_XE_RESET_REQUIRED_UEVENT,
+		DRM_XE_RESET_REQUIRED_UEVENT_REASON_GUC, NULL
+	};
+
+	xe_device_declare_wedged(gt_to_xe(gt), uevent_env);
+}
+
/*
* Wait for the GuC to start up.
*
@@ -684,7 +695,7 @@ static void guc_wait_ucode(struct xe_guc *guc)
break;
}
- xe_device_declare_wedged(gt_to_xe(gt));
+ xe_guc_load_failed(gt);
} else if (delta_ms > GUC_LOAD_TIME_WARN_MS) {
xe_gt_warn(gt, "excessive init time: %lldms! [status = 0x%08X, timeouts = %d]\n",
delta_ms, status, count);
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 460808507947..33ed6221f465 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -891,6 +891,17 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
mutex_unlock(&guc->submission_state.lock);
}
+/* Wedge the device, passing the exec queue timeout reason as uevent parameters. */
+static void xe_exec_queue_timedout(struct xe_device *xe)
+{
+	char *uevent_env[] = {
+		DRM_XE_RESET_REQUIRED_UEVENT,
+		DRM_XE_RESET_REQUIRED_UEVENT_REASON_TOUT, NULL
+	};
+
+	xe_device_declare_wedged(xe, uevent_env);
+}
+
static bool guc_submit_hint_wedged(struct xe_guc *guc)
{
struct xe_device *xe = guc_to_xe(guc);
@@ -901,7 +912,7 @@ static bool guc_submit_hint_wedged(struct xe_guc *guc)
if (xe_device_wedged(xe))
return true;
- xe_device_declare_wedged(xe);
+ xe_exec_queue_timedout(xe);
return true;
}
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index b6fbe4988f2e..dd2f36710057 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -20,6 +20,7 @@ extern "C" {
* 2. Extension definition and helper structs
* 3. IOCTL's Query structs in the order of the Query's entries.
* 4. The rest of IOCTL structs in the order of IOCTL declaration.
+ * 5. uEvents
*/
/**
@@ -1694,6 +1695,34 @@ struct drm_xe_oa_stream_info {
__u64 reserved[3];
};
+/**
+ * DOC: uevent generated by xe on its PCI node.
+ *
+ * DRM_XE_RESET_REQUIRED_UEVENT - Event is generated when device needs reset.
+ * A REASON for the required reset is provided along with the event.
+ * Depending on the REASON, additional information might be supplied.
+ */
+#define DRM_XE_RESET_REQUIRED_UEVENT "DEVICE_STATUS=NEEDS_RESET"
+
+/**
+ * DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT - Reason provided to
+ * DRM_XE_RESET_REQUIRED_UEVENT in case of GT reset failure. The additional
+ * information supplied is tile id and gt id for which reset has failed.
+ */
+#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT "REASON=GT_RESET_FAILED"
+
+/**
+ * DRM_XE_RESET_REQUIRED_UEVENT_REASON_GUC - Reason provided to
+ * DRM_XE_RESET_REQUIRED_UEVENT in case of GuC firmware load failure.
+ */
+#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_GUC "REASON=GUC_LOAD_FAILED"
+
+/**
+ * DRM_XE_RESET_REQUIRED_UEVENT_REASON_TOUT - Reason provided to
+ * DRM_XE_RESET_REQUIRED_UEVENT in case of exec queue timeout.
+ */
+#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_TOUT "REASON=EXEC_QUEUE_TIMEDOUT"
+
#if defined(__cplusplus)
}
#endif