Trying to do a modeset from within a reset is fraught with danger. We can fall into a cyclic deadlock where the modeset is waiting on a previous modeset that is waiting on a request, and since the GPU hung that request completion is waiting on the reset. As modesetting doesn't allow its locks to be broken and restarted, or for its *own* reset mechanism to take over the display, we have to do something very evil instead. If we detect that we are stuck waiting to prepare the display reset (by using a very simple timeout), resort to cancelling all in-flight requests and throwing the user data into /dev/null, which is marginally better than the driver locking up and keeping that data to itself. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_gem.c | 18 ++++-------------- drivers/gpu/drm/i915/i915_irq.c | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c2213016fd86..973f4f9e6b84 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3040,7 +3040,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine) /* Mark all executing requests as skipped */ spin_lock_irqsave(&engine->timeline->lock, flags); list_for_each_entry(request, &engine->timeline->requests, link) - dma_fence_set_error(&request->fence, -EIO); + if (!i915_gem_request_completed(request)) + dma_fence_set_error(&request->fence, -EIO); spin_unlock_irqrestore(&engine->timeline->lock, flags); /* Mark all pending requests as complete so that any concurrent @@ -3079,6 +3080,7 @@ static int __i915_gem_set_wedged_BKL(void *data) struct intel_engine_cs *engine; enum intel_engine_id id; + set_bit(I915_WEDGED, &i915->gpu_error.flags); for_each_engine(engine, i915, id) engine_set_wedged(engine); @@ -3087,20 +3089,7 @@ static int __i915_gem_set_wedged_BKL(void *data) void i915_gem_set_wedged(struct drm_i915_private *dev_priv) { - lockdep_assert_held(&dev_priv->drm.struct_mutex); - set_bit(I915_WEDGED, &dev_priv->gpu_error.flags); - - /* Retire completed requests first so the list of inflight/incomplete - * requests is accurate and we don't try and mark successful requests - * as in error during __i915_gem_set_wedged_BKL(). - */ - i915_gem_retire_requests(dev_priv); - stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL); - - i915_gem_contexts_lost(dev_priv); - - mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0); } bool i915_gem_unset_wedged(struct drm_i915_private *i915) @@ -3155,6 +3144,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) * context and do not require stop_machine(). */ intel_engines_reset_default_submission(i915); + i915_gem_contexts_lost(i915); smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ clear_bit(I915_WEDGED, &i915->gpu_error.flags); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index bce2d1feceb1..6585bcdf61a6 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2599,6 +2599,40 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg) return ret; } +struct wedge_me { + struct delayed_work work; + struct drm_i915_private *i915; + const char *name; +}; + +static void wedge_me(struct work_struct *work) +{ + struct wedge_me *w = container_of(work, typeof(*w), work.work); + + DRM_ERROR("%s timed out, cancelling all in-flight rendering.\n", + w->name); + i915_gem_set_wedged(w->i915); +} + +static bool init_wedge_on_timeout(struct wedge_me *w, + struct drm_i915_private *i915, + long timeout, + const char *name) +{ + w->i915 = i915; + w->name = name; + INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me); + + schedule_delayed_work(&w->work, HZ); + return true; +} + +static void destroy_wedge_on_timeout(struct wedge_me *w) +{ + cancel_delayed_work_sync(&w->work); + destroy_delayed_work_on_stack(&w->work); +} + /** * i915_reset_and_wakeup - do process context error handling work * @dev_priv: i915 device private @@ -2612,13 +2646,17 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; + struct wedge_me w; kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); DRM_DEBUG_DRIVER("resetting chip\n"); kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); - intel_prepare_reset(dev_priv); + if (init_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset")) { + intel_prepare_reset(dev_priv); + destroy_wedge_on_timeout(&w); + } set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags); wake_up_all(&dev_priv->gpu_error.wait_queue); @@ -2642,6 +2680,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) HZ)); intel_finish_reset(dev_priv); + mod_delayed_work(dev_priv->wq, &dev_priv->gt.retire_work, 0); if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags)) kobject_uevent_env(kobj, -- 2.11.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx