---
drivers/gpu/drm/i915/gt/uc/intel_guc.c | 38 +++++++++++++++++++++++
drivers/gpu/drm/i915/gt/uc/intel_guc.h | 15 +++++++++
drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 6 +---
3 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 569b5fe94c416..12a817b762334 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -159,6 +159,21 @@ static void gen11_disable_guc_interrupts(struct intel_guc *guc)
gen11_reset_guc_interrupts(guc);
}
+static void guc_dead_worker_func(struct work_struct *w)
+{
+ struct intel_guc *guc = container_of(w, struct intel_guc, dead_guc_worker);
+ struct intel_gt *gt = guc_to_gt(guc);
+ unsigned long last = guc->last_dead_guc_jiffies;
+ unsigned long delta = jiffies_to_msecs(jiffies - last);
+
+ if (delta < 500) {
+ intel_gt_set_wedged(gt);
+ } else {
+ intel_gt_handle_error(gt, ALL_ENGINES, I915_ERROR_CAPTURE, "dead GuC");
+ guc->last_dead_guc_jiffies = jiffies;
+ }
+}
+
void intel_guc_init_early(struct intel_guc *guc)
{
struct intel_gt *gt = guc_to_gt(guc);
@@ -171,6 +186,8 @@ void intel_guc_init_early(struct intel_guc *guc)
intel_guc_slpc_init_early(&guc->slpc);
intel_guc_rc_init_early(guc);
+ INIT_WORK(&guc->dead_guc_worker, guc_dead_worker_func);
+
mutex_init(&guc->send_mutex);
spin_lock_init(&guc->irq_lock);
if (GRAPHICS_VER(i915) >= 11) {
@@ -461,6 +478,8 @@ void intel_guc_fini(struct intel_guc *guc)
if (!intel_uc_fw_is_loadable(&guc->fw))
return;
+ flush_work(&guc->dead_guc_worker);
+
if (intel_guc_slpc_is_used(guc))
intel_guc_slpc_fini(&guc->slpc);
@@ -585,6 +604,20 @@ int intel_guc_send_mmio(struct intel_guc *guc, const u32 *request, u32 len,
return ret;
}
+int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action)
+{
+ if (action == INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED)
+ guc_err(guc, "Crash dump notification\n");
+ else if (action == INTEL_GUC_ACTION_NOTIFY_EXCEPTION)
+ guc_err(guc, "Exception notification\n");
+ else
+ guc_err(guc, "Unknown crash notification: 0x%04X\n", action);
+
+ queue_work(system_unbound_wq, &guc->dead_guc_worker);
+
+ return 0;
+}
+
int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
const u32 *payload, u32 len)
{
@@ -601,6 +634,9 @@ int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
if (msg & INTEL_GUC_RECV_MSG_EXCEPTION)
guc_err(guc, "Received early exception notification!\n");
+ if (msg & (INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED | INTEL_GUC_RECV_MSG_EXCEPTION))
+ queue_work(system_unbound_wq, &guc->dead_guc_worker);
+
return 0;
}
@@ -640,6 +676,8 @@ int intel_guc_suspend(struct intel_guc *guc)
return 0;
if (intel_guc_submission_is_used(guc)) {
+ flush_work(&guc->dead_guc_worker);
+
/*
* This H2G MMIO command tears down the GuC in two steps. First it will
* generate a G2H CTB for every active context indicating a reset. In
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 8dc291ff00935..6c392bad29c19 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -266,6 +266,20 @@ struct intel_guc {
unsigned long last_stat_jiffies;
} timestamp;
+ /**
+ * @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset.
+ * Specifically used when the G2H handler wants to issue a reset. Resets
+ * require flushing the G2H queue. So, the G2H processing itself must not
+ * trigger a reset directly. Instead, go via this worker.
+ */
+ struct work_struct dead_guc_worker;
+ /**
+ * @last_dead_guc_jiffies: timestamp of previous 'dead guc' occurrance
+ * used to prevent a fundamentally broken system from continuously
+ * reloading the GuC.
+ */
+ unsigned long last_dead_guc_jiffies;
+
#ifdef CONFIG_DRM_I915_SELFTEST
/**
* @number_guc_id_stolen: The number of guc_ids that have been stolen
@@ -476,6 +490,7 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
const u32 *msg, u32 len);
int intel_guc_error_capture_process_msg(struct intel_guc *guc,
const u32 *msg, u32 len);
+int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action);
struct intel_engine_cs *
intel_guc_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 97eadd08181d6..6e22af31513a5 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -1112,12 +1112,8 @@ static int ct_process_request(struct intel_guc_ct *ct, struct ct_incoming_msg *r
ret = 0;
break;
case INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED:
- CT_ERROR(ct, "Received GuC crash dump notification!\n");
- ret = 0;
- break;
case INTEL_GUC_ACTION_NOTIFY_EXCEPTION:
- CT_ERROR(ct, "Received GuC exception notification!\n");
- ret = 0;
+ ret = intel_guc_crash_process_msg(guc, action);
break;
default:
ret = -EOPNOTSUPP;