On Mon, Sep 19, 2016 at 04:30:14PM +0100, Matthew Auld wrote: > From: "arun.siluvery@xxxxxxxxxxxxxxx" <arun.siluvery@xxxxxxxxxxxxxxx> > > This is a preparatory patch which modifies error handler to do per engine > hang recovery. The actual patch which implements this sequence follows > later in the series. The aim is to prepare existing recovery function to > adapt to this new function where applicable (which fails at this point > because core implementation is lacking) and continue recovery using legacy > full gpu reset. > > A helper function is also added to query the availability of engine > reset. > > The error events behaviour that are used to notify user of reset are > adapted to engine reset such that it doesn't break users listening to these > events. In legacy we report an error event, a reset event before resetting > the gpu and a reset done event marking the completion of reset. The same > behaviour is adapted but reset event is only dispatched once even when > multiple engines are hung. Finally once reset is complete we send reset > done event as usual. > > v2 > - rework slightly > - prefer INTEL_GEN > - document engine_mask param > > Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > Signed-off-by: Ian Lister <ian.lister@xxxxxxxxx> > Signed-off-by: Tomas Elf <tomas.elf@xxxxxxxxx> > Signed-off-by: Arun Siluvery <arun.siluvery@xxxxxxxxxxxxxxx> > Signed-off-by: Matthew Auld <matthew.auld@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_drv.c | 26 +++++++++++++++++++ > drivers/gpu/drm/i915/i915_drv.h | 2 ++ > drivers/gpu/drm/i915/i915_irq.c | 50 +++++++++++++++++++++++++++++++++---- > drivers/gpu/drm/i915/intel_uncore.c | 5 ++++ > 4 files changed, 78 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index 7f4e8ad..99fa690 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -1804,6 +1804,32 @@ error: > goto wakeup; > } > > +/** > + * i915_reset_engine - reset GPU engine to recover from a hang > + * @engine: engine to reset > + * > + * Reset a specific GPU engine. Useful if a hang is detected. > + * Returns zero on successful reset or otherwise an error code. > + * > + * Procedure is fairly simple: > + * - force engine to idle > + * - save current state which includes head and current request > + * - reset engine > + * - restore saved state and resubmit context > + */ > +int i915_reset_engine(struct intel_engine_cs *engine) > +{ > + int ret; > + struct drm_i915_private *dev_priv = engine->i915; > + > + /* FIXME: replace me with engine reset sequence */ > + ret = -ENODEV; > + > + set_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags); > + > + return ret; > +} > + > static int i915_pm_suspend(struct device *kdev) > { > struct pci_dev *pdev = to_pci_dev(kdev); > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 4dd307e..79de74d 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2888,6 +2888,8 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd, > extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask); > extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv); > extern void i915_reset(struct drm_i915_private *dev_priv); > +extern bool intel_has_engine_reset(struct drm_i915_private *dev_priv); > +extern int i915_reset_engine(struct intel_engine_cs *engine); > extern int intel_guc_reset(struct drm_i915_private *dev_priv); > extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine); > extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index c128fdb..45c5a26 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2487,11 +2487,13 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv) > /** > * i915_reset_and_wakeup - do process context error handling work > * @dev_priv: i915 device private > + * @engine_mask: engines which are marked as hung > * > * Fire an error uevent so userspace can see that a hang or error > * was detected. > */ > -static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) > +static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv, > + u32 engine_mask) > { > struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj; > char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; > @@ -2501,6 +2503,15 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) > kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); > > DRM_DEBUG_DRIVER("resetting chip\n"); > + /* > + * This event needs to be sent before performing gpu reset. When > + * engine resets are supported we iterate through all engines and > + * reset hung engines individually. To keep the event dispatch > + * mechanism consistent with full gpu reset, this is only sent once > + * even when multiple engines are hung. It is also safe to move this > + * here because when we are in this function, we will definitely > + * perform gpu reset. > + */ > kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); > > /* > @@ -2511,6 +2522,28 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) > * simulated reset via debugs, so get an RPM reference. > */ > intel_runtime_pm_get(dev_priv); > + > + /* > + * First attempt to reset the engines which are marked as hung. If that > + * fails then we fallback to doing a full gpu reset. > + */ > + if (intel_has_engine_reset(dev_priv)) { > + struct intel_engine_cs *engine; > + unsigned int tmp; > + int ret = 0; > + > + for_each_engine_masked(engine, dev_priv, engine_mask, tmp) { > + ret = i915_reset_engine(engine); > + if (ret) > + break; > + } > + > + if (!ret) { > + DRM_DEBUG_DRIVER("reset hung engines.\n"); > + goto reset_done; > + } > + } > + > intel_prepare_reset(dev_priv); > > do { > @@ -2532,6 +2565,8 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) > HZ)); > > intel_finish_reset(dev_priv); > + > +reset_done: > intel_runtime_pm_put(dev_priv); > > if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags)) > @@ -2640,6 +2675,8 @@ static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv) > * i915_handle_error - handle a gpu error > * @dev_priv: i915 device private > * @engine_mask: mask representing engines that are hung > + * @fmt: formatted hang msg that gets logged in captured error state > + * > * Do some basic checking of register state at error time and > * dump it to the syslog. Also call i915_capture_error_state() to make > * sure we get a record and make it available in debugfs. Fire a uevent > @@ -2664,9 +2701,12 @@ void i915_handle_error(struct drm_i915_private *dev_priv, > if (!engine_mask) > return; > > - if (test_and_set_bit(I915_RESET_IN_PROGRESS, > - &dev_priv->gpu_error.flags)) > - return; No. This is a serialised test for a reason. -Chris -- Chris Wilson, Intel Open Source Technology Centre _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx