We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Signed-Off-by: Daniel Vetter <daniel.vetter at ffwll.ch> --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 40 +++++++++++++++++++++++++++-- drivers/gpu/drm/i915/i915_gem.c | 49 +++++++++++++----------------------- drivers/gpu/drm/i915/i915_irq.c | 23 +++++++++++------ drivers/gpu/drm/i915/intel_display.c | 4 +-- 5 files changed, 74 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index ad4cdfe..2ba0236 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1596,7 +1596,7 @@ i915_wedged_read(struct file *filp, len = snprintf(buf, sizeof(buf), "wedged : %d\n", - atomic_read(&dev_priv->gpu_error.wedged)); + atomic_read(&dev_priv->gpu_error.reset_counter)); if (len > sizeof(buf)) len = sizeof(buf); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 6958bb0..f68de01 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -723,11 +723,37 @@ struct i915_gpu_error { /* Protected by the above dev->gpu_error.lock. */ struct drm_i915_error_state *first_error; struct work_struct work; - struct completion completion; unsigned long last_reset; - atomic_t wedged; + /** + * State variable controlling the reset flow + * + * Upper bits are for the reset counter. + * + * Lowest bit controls the reset state machine: Set means a reset is in + * progress. This state will (presuming we don't have any bugs) decay + * into either unset (successful reset) or the special WEDGED value (hw + * terminally sour). All waiters on the reset_queue will be woken when + * that happens. + */ + atomic_t reset_counter; + + /** + * Special values/flags for reset_counter + * + * Note that the code relies on + * I915_WEDGED & I915_RESET_IN_PROGRESS_FLAG + * being true. + */ +#define I915_RESET_IN_PROGRESS_FLAG 1 +#define I915_WEDGED 0xffffffff + + /** + * Waitqueue to signal when the reset has completed. Used by clients + * that wait for dev_priv->mm.wedged to settle. + */ + wait_queue_head_t reset_queue; /* For gpu hang simulation. */ unsigned int stop_rings; @@ -1457,6 +1483,16 @@ void i915_gem_retire_requests(struct drm_device *dev); void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring); int __must_check i915_gem_check_wedge(struct i915_gpu_error *error, bool interruptible); +static inline bool i915_reset_in_progress(struct i915_gpu_error *error) +{ + return unlikely(atomic_read(&error->reset_counter) + & I915_RESET_IN_PROGRESS_FLAG); +} + +static inline bool i915_terminally_wedged(struct i915_gpu_error *error) +{ + return atomic_read(&error->reset_counter) == I915_WEDGED; +} void i915_gem_reset(struct drm_device *dev); void i915_gem_clflush_object(struct drm_i915_gem_object *obj); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 55cdad9..ddad3ed 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -89,36 +89,32 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv, static int i915_gem_wait_for_error(struct i915_gpu_error *error) { - struct completion *x = &error->completion; - unsigned long flags; int ret; - if (!atomic_read(&error->wedged)) +#define EXIT_COND (!i915_reset_in_progress(error)) + if (EXIT_COND) return 0; + /* GPU is already declared terminally dead, give up. */ + if (i915_terminally_wedged(error)) + return -EIO; + /* * Only wait 10 seconds for the gpu reset to complete to avoid hanging * userspace. If it takes that long something really bad is going on and * we should simply try to bail out and fail as gracefully as possible. */ - ret = wait_for_completion_interruptible_timeout(x, 10*HZ); + ret = wait_event_interruptible_timeout(error->reset_queue, + EXIT_COND, + 10*HZ); if (ret == 0) { DRM_ERROR("Timed out waiting for the gpu reset to complete\n"); return -EIO; } else if (ret < 0) { return ret; } +#undef EXIT_COND - if (atomic_read(&error->wedged)) { - /* GPU is hung, bump the completion count to account for - * the token we just consumed so that we never hit zero and - * end up waiting upon a subsequent completion event that - * will never happen. - */ - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - spin_unlock_irqrestore(&x->wait.lock, flags); - } return 0; } @@ -943,23 +939,14 @@ int i915_gem_check_wedge(struct i915_gpu_error *error, bool interruptible) { - if (atomic_read(&error->wedged)) { - struct completion *x = &error->completion; - bool recovery_complete; - unsigned long flags; - - /* Give the error handler a chance to run. */ - spin_lock_irqsave(&x->wait.lock, flags); - recovery_complete = x->done > 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - + if (i915_reset_in_progress(error)) { /* Non-interruptible callers can't handle -EAGAIN, hence return * -EIO unconditionally for these. */ if (!interruptible) return -EIO; - /* Recovery complete, but still wedged means reset failure. */ - if (recovery_complete) + /* Recovery complete, but the reset failed ... */ + if (i915_terminally_wedged(error)) return -EIO; return -EAGAIN; @@ -1026,7 +1013,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, #define EXIT_COND \ (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ - atomic_read(&dev_priv->gpu_error.wedged)) + i915_reset_in_progress(&dev_priv->gpu_error)) do { if (interruptible) end = wait_event_interruptible_timeout(ring->irq_queue, @@ -1385,7 +1372,7 @@ out: /* If this -EIO is due to a gpu hang, give the reset code a * chance to clean up the mess. Otherwise return the proper * SIGBUS. */ - if (!atomic_read(&dev_priv->gpu_error.wedged)) + if (i915_terminally_wedged(&dev_priv->gpu_error)) return VM_FAULT_SIGBUS; case -EAGAIN: /* Give the error handler a chance to run and move the @@ -4032,9 +4019,9 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data, if (drm_core_check_feature(dev, DRIVER_MODESET)) return 0; - if (atomic_read(&dev_priv->gpu_error.wedged)) { + if (i915_reset_in_progress(&dev_priv->gpu_error)) { DRM_ERROR("Reenabling wedged hardware, good luck\n"); - atomic_set(&dev_priv->gpu_error.wedged, 0); + atomic_set(&dev_priv->gpu_error.reset_counter, 0); } mutex_lock(&dev->struct_mutex); @@ -4112,7 +4099,7 @@ i915_gem_load(struct drm_device *dev) INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list); INIT_DELAYED_WORK(&dev_priv->mm.retire_work, i915_gem_retire_work_handler); - init_completion(&dev_priv->gpu_error.completion); + init_waitqueue_head(&dev_priv->gpu_error.reset_queue); /* On GEN3 we really need to make sure the ARB C3 LP bit is set */ if (IS_GEN3(dev)) { diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 9d8921a..08e1287 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -838,8 +838,10 @@ done: */ static void i915_error_work_func(struct work_struct *work) { - drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, - gpu_error.work); + struct i915_gpu_error *error = container_of(work, struct i915_gpu_error, + work); + drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t, + gpu_error); struct drm_device *dev = dev_priv->dev; char *error_event[] = { "ERROR=1", NULL }; char *reset_event[] = { "RESET=1", NULL }; @@ -847,14 +849,18 @@ static void i915_error_work_func(struct work_struct *work) kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event); - if (atomic_read(&dev_priv->gpu_error.wedged)) { + if (i915_reset_in_progress(error)) { DRM_DEBUG_DRIVER("resetting chip\n"); kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event); + if (!i915_reset(dev)) { - atomic_set(&dev_priv->gpu_error.wedged, 0); + atomic_set(&error->reset_counter, 0); kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event); + } else { + atomic_set(&error->reset_counter, I915_WEDGED); } - complete_all(&dev_priv->gpu_error.completion); + + wake_up_all(&dev_priv->gpu_error.reset_queue); } } @@ -1434,11 +1440,12 @@ void i915_handle_error(struct drm_device *dev, bool wedged) i915_report_and_clear_eir(dev); if (wedged) { - INIT_COMPLETION(dev_priv->gpu_error.completion); - atomic_set(&dev_priv->gpu_error.wedged, 1); + atomic_set(&dev_priv->gpu_error.reset_counter, + I915_RESET_IN_PROGRESS_FLAG); /* - * Wakeup waiting processes so they don't hang + * Wakeup waiting processes so that the reset work item + * doesn't deadlock trying to grab various locks. */ for_each_ring(ring, dev_priv, i) wake_up_all(&ring->irq_queue); diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index e321c9e..c7303e1 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2242,7 +2242,7 @@ intel_finish_fb(struct drm_framebuffer *old_fb) int ret; wait_event(dev_priv->pending_flip_queue, - atomic_read(&dev_priv->gpu_error.wedged) || + i915_reset_in_progress(&dev_priv->gpu_error) || atomic_read(&obj->pending_flip) == 0); /* Big Hammer, we also need to ensure that any pending @@ -2963,7 +2963,7 @@ static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc) unsigned long flags; bool pending; - if (atomic_read(&dev_priv->gpu_error.wedged)) + if (i915_reset_in_progress(&dev_priv->gpu_error)) return false; spin_lock_irqsave(&dev->event_lock, flags); -- 1.7.11.4