Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > If we fail to reset the GPU, we declare the machine wedged. However, the > GPU may well still be running in the background with an in-flight > request. So despite our efforts in cleaning up the request queue and > faking the breadcrumb in the HWSP, the GPU may eventually write the > in-flght seqno there breaking all of our assumptions and throwing the > driver into a deep turmoil, wedging beyond wedged. > > To avoid this we ideally want to reset the GPU. Since that has already > failed, make sure the rings have the stop bit set instead. This is part > of the normal GPU reset sequence, but that is actually disabled by > igt/gem_eio to force the wedged state. If we assume the worst, we must > poke at the bit again before we give up. I am pondering if you would save so much trouble by just adding int intel_get_gpu_reset(struct drm_i915_private *dev_priv, bool ignore_modparam) and then forcing a stop engine and the last reset straight in wedging -Mika > > Testcase: igt/gem_eio > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > Cc: Michał Winiarski <michal.winiarski@xxxxxxxxx> > Cc: Michal Wajdeczko <michal.wajdeczko@xxxxxxxxx> > Cc: Michel Thierry <michel.thierry@xxxxxxxxx> > --- > git add fail > We want to stop the tasklets before hitting stop-engines or else we > may trigger an early CS completion and more asserts. > -Chris > --- > drivers/gpu/drm/i915/i915_gem.c | 2 ++ > drivers/gpu/drm/i915/intel_engine_cs.c | 43 ++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/intel_ringbuffer.h | 2 ++ > drivers/gpu/drm/i915/intel_uncore.c | 46 +-------------------------------- > 4 files changed, 48 insertions(+), 45 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 2fbd622bba30..208619981ea1 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -3246,6 +3246,8 @@ void i915_gem_set_wedged(struct drm_i915_private *i915) > } > i915->caps.scheduler = 0; > > + intel_engines_stop(i915, ALL_ENGINES); > + > /* > * Make sure no one is running the old callback before we proceed with > * cancelling requests and resetting the completion tracking. Otherwise > diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c > index 337dfa56a738..44eddf10f4d1 100644 > --- a/drivers/gpu/drm/i915/intel_engine_cs.c > +++ b/drivers/gpu/drm/i915/intel_engine_cs.c > @@ -1727,6 +1727,49 @@ unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915) > return which; > } > > +static void gen3_stop_engine(struct intel_engine_cs *engine) > +{ > + struct drm_i915_private *dev_priv = engine->i915; > + const u32 base = engine->mmio_base; > + const i915_reg_t mode = RING_MI_MODE(base); > + > + I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING)); > + if (intel_wait_for_register_fw(dev_priv, > + mode, > + MODE_IDLE, > + MODE_IDLE, > + 500)) > + DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", > + engine->name); > + > + I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base))); > + POSTING_READ_FW(RING_HEAD(base)); /* paranoia */ > + > + I915_WRITE_FW(RING_HEAD(base), 0); > + I915_WRITE_FW(RING_TAIL(base), 0); > + POSTING_READ_FW(RING_TAIL(base)); > + > + /* The ring must be empty before it is disabled */ > + I915_WRITE_FW(RING_CTL(base), 0); > + > + /* Check acts as a post */ > + if (I915_READ_FW(RING_HEAD(base)) != 0) > + DRM_DEBUG_DRIVER("%s: ring head not parked\n", > + engine->name); > +} > + > +void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask) > +{ > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + > + if (INTEL_GEN(i915) < 3) > + return; > + > + for_each_engine_masked(engine, i915, engine_mask, id) > + gen3_stop_engine(engine); > +} > + > static void print_request(struct drm_printer *m, > struct i915_request *rq, > const char *prefix) > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index 1f50727a5ddb..1369f7c5b57e 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -1018,6 +1018,8 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine); > void intel_engines_park(struct drm_i915_private *i915); > void intel_engines_unpark(struct drm_i915_private *i915); > > +void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask); > + > void intel_engines_reset_default_submission(struct drm_i915_private *i915); > unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915); > > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c > index 4df7c2ef8576..cda4c06acf16 100644 > --- a/drivers/gpu/drm/i915/intel_uncore.c > +++ b/drivers/gpu/drm/i915/intel_uncore.c > @@ -1639,50 +1639,6 @@ int i915_reg_read_ioctl(struct drm_device *dev, > return ret; > } > > -static void gen3_stop_engine(struct intel_engine_cs *engine) > -{ > - struct drm_i915_private *dev_priv = engine->i915; > - const u32 base = engine->mmio_base; > - const i915_reg_t mode = RING_MI_MODE(base); > - > - I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING)); > - if (intel_wait_for_register_fw(dev_priv, > - mode, > - MODE_IDLE, > - MODE_IDLE, > - 500)) > - DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", > - engine->name); > - > - I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base))); > - POSTING_READ_FW(RING_HEAD(base)); /* paranoia */ > - > - I915_WRITE_FW(RING_HEAD(base), 0); > - I915_WRITE_FW(RING_TAIL(base), 0); > - POSTING_READ_FW(RING_TAIL(base)); > - > - /* The ring must be empty before it is disabled */ > - I915_WRITE_FW(RING_CTL(base), 0); > - > - /* Check acts as a post */ > - if (I915_READ_FW(RING_HEAD(base)) != 0) > - DRM_DEBUG_DRIVER("%s: ring head not parked\n", > - engine->name); > -} > - > -static void i915_stop_engines(struct drm_i915_private *dev_priv, > - unsigned engine_mask) > -{ > - struct intel_engine_cs *engine; > - enum intel_engine_id id; > - > - if (INTEL_GEN(dev_priv) < 3) > - return; > - > - for_each_engine_masked(engine, dev_priv, engine_mask, id) > - gen3_stop_engine(engine); > -} > - > static bool i915_in_reset(struct pci_dev *pdev) > { > u8 gdrst; > @@ -2057,7 +2013,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask) > * > * FIXME: Wa for more modern gens needs to be validated > */ > - i915_stop_engines(dev_priv, engine_mask); > + intel_engines_stop(dev_priv, engine_mask); > > ret = -ENODEV; > if (reset) > -- > 2.16.2 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx