My apologies to Chris Wilson for being dense on this topic for essentially for years. This patch doesn't do any big redesign of the overall reset flow, but instead just applies changes where it's needed to obey gem_eio. We can redesign later on, but for now I prefer to make the testcase happy with some minimally invasive changes: - Ensure that set_domain_ioctl never returns -EIO. Tricky part there is that we might race with the reset handler when doing the lockless wait. - Make sure debugfs/i915_wedged is actually useful to reset a wedged gpu - atm it bails out with -EAGAIN for a terminally wedged gpu. That's because reset_in_progress actually includes that terminal state, which is super-confusing (and most callers got this wrong). Fix this by changing the semantics of reset_in_progress to do what it says on the tin (and fixup all other callers). Also make sure that reset_in_progress is cleared when we reach the terminal "wedged" state. Without this we kept returning -EAGAIN in some places forever. - Second problem with debugfs/i915_wedge was that we never got out of the terminal wedged state. Hence manually set the reset counter out of that state before starting the hung gpu recovery. For safety also make sure that we are consintent with resetting the gpu between i915_reset_and_wakeup and i915_handler_error by just passing the boolean "wedged" around instead of trying to recompute it wrongly. This isn't an issue for gem_eio with the debugfs fix, but just general robustness of the code. - Finally make sure that when gpu reset is disabled through the module paramter the kernel doesn't generate dmesg noise that would upset our CI/piglit. Simplest solution for that was to lift the i915.reset check up into i915_reset(). With all these changes, plus the igt fixes I've posted, gem_eio is now happy on many snb. v2: Don't upset lockdep with my set_domain_ioctl changes. Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Testcase: igt/gem_eio/* Signed-off-by: Daniel Vetter <daniel.vetter@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 4 ++++ drivers/gpu/drm/i915/i915_drv.c | 6 ++++++ drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 23 +++++++++++------------ drivers/gpu/drm/i915/i915_irq.c | 11 ++++++----- drivers/gpu/drm/i915/intel_uncore.c | 3 --- 6 files changed, 28 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 411a9c68b4ee..c4006c09ef1b 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -4744,6 +4744,10 @@ i915_wedged_set(void *data, u64 val) if (i915_reset_in_progress(&dev_priv->gpu_error)) return -EAGAIN; + /* Already wedged, force us out of this terminal state. */ + if (i915_terminally_wedged(&dev_priv->gpu_error)) + atomic_set(&dev_priv->gpu_error.reset_counter, 0); + intel_runtime_pm_get(dev_priv); i915_handle_error(dev, val, diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index ec1e877c4a36..1f5386bb78f4 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -909,6 +909,12 @@ int i915_reset(struct drm_device *dev) simulated = dev_priv->gpu_error.stop_rings != 0; + if (!i915.reset) { + DRM_INFO("GPU reset disabled in module option, not resetting\n"); + mutex_unlock(&dev->struct_mutex); + return -ENODEV; + } + ret = intel_gpu_reset(dev); /* Also reset the gpu hangman. */ diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index a47e0f4fab56..8876b4891d56 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2939,7 +2939,7 @@ int __must_check i915_gem_check_wedge(struct i915_gpu_error *error, static inline bool i915_reset_in_progress(struct i915_gpu_error *error) { return unlikely(atomic_read(&error->reset_counter) - & (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED)); + & I915_RESET_IN_PROGRESS_FLAG); } static inline bool i915_terminally_wedged(struct i915_gpu_error *error) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index f10a5d57377e..64c63409b9d0 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -85,8 +85,7 @@ i915_gem_wait_for_error(struct i915_gpu_error *error) { int ret; -#define EXIT_COND (!i915_reset_in_progress(error) || \ - i915_terminally_wedged(error)) +#define EXIT_COND (!i915_reset_in_progress(error)) if (EXIT_COND) return 0; @@ -1113,16 +1112,16 @@ int i915_gem_check_wedge(struct i915_gpu_error *error, bool interruptible) { + /* Recovery complete, but the reset failed ... */ + if (i915_terminally_wedged(error)) + return -EIO; + if (i915_reset_in_progress(error)) { /* Non-interruptible callers can't handle -EAGAIN, hence return * -EIO unconditionally for these. */ if (!interruptible) return -EIO; - /* Recovery complete, but the reset failed ... */ - if (i915_terminally_wedged(error)) - return -EIO; - /* * Check if GPU Reset is in progress - we need intel_ring_begin * to work properly to reinit the hw state while the gpu is @@ -1239,11 +1238,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req, /* We need to check whether any gpu reset happened in between * the caller grabbing the seqno and now ... */ if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) { - /* ... but upgrade the -EAGAIN to an -EIO if the gpu - * is truely gone. */ - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); - if (ret == 0) - ret = -EAGAIN; + ret = -EAGAIN; break; } @@ -1577,7 +1572,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, ret = i915_mutex_lock_interruptible(dev); if (ret) - return ret; + goto out; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { @@ -1609,6 +1604,10 @@ unref: drm_gem_object_unreference(&obj->base); unlock: mutex_unlock(&dev->struct_mutex); +out: + /* ABI: set_domain_ioctl must not fail even when the gpu is wedged. */ + WARN_ON(ret == -EIO); + return ret; } diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index c8ba94968aaf..dbbc6159584b 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2401,7 +2401,8 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv, * Fire an error uevent so userspace can see that a hang or error * was detected. */ -static void i915_reset_and_wakeup(struct drm_device *dev) +static void i915_reset_and_wakeup(struct drm_device *dev, + bool wedged) { struct drm_i915_private *dev_priv = to_i915(dev); struct i915_gpu_error *error = &dev_priv->gpu_error; @@ -2422,7 +2423,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev) * the reset in-progress bit is only ever set by code outside of this * work we don't need to worry about any other races. */ - if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) { + if (wedged) { DRM_DEBUG_DRIVER("resetting chip\n"); kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, reset_event); @@ -2432,7 +2433,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev) * reference held, for example because there is a pending GPU * request that won't finish until the reset is done. This * isn't the case at least when we get here by doing a - * simulated reset via debugs, so get an RPM reference. + * simulated reset via debugfs, so get an RPM reference. */ intel_runtime_pm_get(dev_priv); @@ -2467,7 +2468,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev) kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, reset_done_event); } else { - atomic_or(I915_WEDGED, &error->reset_counter); + atomic_set(&error->reset_counter, I915_WEDGED); } /* @@ -2614,7 +2615,7 @@ void i915_handle_error(struct drm_device *dev, bool wedged, i915_error_wake_up(dev_priv, false); } - i915_reset_and_wakeup(dev); + i915_reset_and_wakeup(dev, wedged); } /* Called from drm generic code, passed 'crtc' which diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index c2358ba78b30..4c5ae1154dd1 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -1543,9 +1543,6 @@ not_ready: static int (*intel_get_gpu_reset(struct drm_device *dev))(struct drm_device *) { - if (!i915.reset) - return NULL; - if (INTEL_INFO(dev)->gen >= 8) return gen8_do_reset; else if (INTEL_INFO(dev)->gen >= 6) -- 2.1.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx