-----Original Message----- From: Andi Shyti <andi.shyti@xxxxxxxxxxxxxxx> Sent: Wednesday, August 7, 2024 2:10 AM To: intel-gfx <intel-gfx@xxxxxxxxxxxxxxxxxxxxx>; dri-devel <dri-devel@xxxxxxxxxxxxxxxxxxxxx> Cc: Chris Wilson <chris.p.wilson@xxxxxxxxxxxxxxx>; Das, Nirmoy <nirmoy.das@xxxxxxxxx>; Cavitt, Jonathan <jonathan.cavitt@xxxxxxxxx>; Andi Shyti <andi.shyti@xxxxxxxxxxxxxxx> Subject: [PATCH] drm/i915/gt: Mark the GT as dead when mmio is unreliable > > From: Chris Wilson <chris.p.wilson@xxxxxxxxx> > > After we detect that mmio is returning all 0xff, we believe that the GPU > has dropped off the pci bus and is dead. Mark the device as wedged such > that we can propagate the failure back to userspace and wait for > recovery. > > Signed-off-by: Chris Wilson <chris.p.wilson@xxxxxxxxx> > Signed-off-by: Andi Shyti <andi.shyti@xxxxxxxxxxxxxxx> LGTM. Reviewed-by: Jonathan Cavitt <jonathan.cavitt@xxxxxxxxx> -Jonathan Cavitt > --- > drivers/gpu/drm/i915/gt/intel_gt.h | 6 ++++++ > drivers/gpu/drm/i915/gt/intel_gt_types.h | 2 ++ > drivers/gpu/drm/i915/gt/intel_reset.c | 12 +++++++++++- > drivers/gpu/drm/i915/intel_uncore.c | 7 +++++-- > 4 files changed, 24 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h > index b5e114d284ad..b73555889d50 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.h > +++ b/drivers/gpu/drm/i915/gt/intel_gt.h > @@ -208,4 +208,10 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt, > void intel_gt_bind_context_set_ready(struct intel_gt *gt); > void intel_gt_bind_context_set_unready(struct intel_gt *gt); > bool intel_gt_is_bind_context_ready(struct intel_gt *gt); > + > +static inline void intel_gt_set_wedged_async(struct intel_gt *gt) > +{ > + queue_work(system_highpri_wq, &gt->wedge); > +} > + > #endif /* __INTEL_GT_H__ */ > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h > index cfdd2ad5e954..bcee084b1f27 100644 > --- 
a/drivers/gpu/drm/i915/gt/intel_gt_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h > @@ -292,6 +292,8 @@ struct intel_gt { > struct gt_defaults defaults; > struct kobject *sysfs_defaults; > > + struct work_struct wedge; > + > struct i915_perf_gt perf; > > /** link: &ggtt.gt_list */ > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c > index 735cd23a43c6..8f1ea95471ef 100644 > --- a/drivers/gpu/drm/i915/gt/intel_reset.c > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c > @@ -1013,6 +1013,15 @@ static void __intel_gt_set_wedged(struct intel_gt *gt) > GT_TRACE(gt, "end\n"); > } > > +static void set_wedged_work(struct work_struct *w) > +{ > + struct intel_gt *gt = container_of(w, struct intel_gt, wedge); > + intel_wakeref_t wf; > + > + with_intel_runtime_pm(gt->uncore->rpm, wf) > + __intel_gt_set_wedged(gt); > +} > + > void intel_gt_set_wedged(struct intel_gt *gt) > { > intel_wakeref_t wakeref; > @@ -1614,6 +1623,7 @@ void intel_gt_init_reset(struct intel_gt *gt) > init_waitqueue_head(&gt->reset.queue); > mutex_init(&gt->reset.mutex); > init_srcu_struct(&gt->reset.backoff_srcu); > + INIT_WORK(&gt->wedge, set_wedged_work); > > /* > * While undesirable to wait inside the shrinker, complain anyway. 
> @@ -1640,7 +1650,7 @@ static void intel_wedge_me(struct work_struct *work) > struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); > > gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name); > - intel_gt_set_wedged(w->gt); > + set_wedged_work(&w->gt->wedge); > } > > void __intel_init_wedge(struct intel_wedge_me *w, > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c > index 2eba289d88ad..6aa179a3e92a 100644 > --- a/drivers/gpu/drm/i915/intel_uncore.c > +++ b/drivers/gpu/drm/i915/intel_uncore.c > @@ -24,6 +24,7 @@ > #include <drm/drm_managed.h> > #include <linux/pm_runtime.h> > > +#include "gt/intel_gt.h" > #include "gt/intel_engine_regs.h" > #include "gt/intel_gt_regs.h" > > @@ -180,14 +181,16 @@ fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d) > if (!wait_ack_clear(d, FORCEWAKE_KERNEL)) > return; > > - if (fw_ack(d) == ~0) > + if (fw_ack(d) == ~0) { > drm_err(&d->uncore->i915->drm, > "%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n", > intel_uncore_forcewake_domain_to_str(d->id)); > - else > + intel_gt_set_wedged_async(d->uncore->gt); > + } else { > drm_err(&d->uncore->i915->drm, > "%s: timed out waiting for forcewake ack to clear.\n", > intel_uncore_forcewake_domain_to_str(d->id)); > + } > > add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */ > } > -- > 2.45.2 > >