Re: [PATCH] drm/i915: Fix up -EIO ABI per igt/gem_eio

Daniel Vetter <daniel@xxxxxxxx> · Thu, 26 Nov 2015 12:33:24 +0100

On Thu, Nov 26, 2015 at 11:51:09AM +0100, Daniel Vetter wrote:
> My apologies to Chris Wilson for being dense on this topic for
> essentially for years.
> 
> This patch doesn't do any big redesign of the overall reset flow, but
> instead just applies changes where it's needed to obey gem_eio. We can
> redesign later on, but for now I prefer to make the testcase happy
> with some minimally invasive changes:
> 
> - Ensure that set_domain_ioctl never returns -EIO. Tricky part there
>   is that we might race with the reset handler when doing the lockless
>   wait.
> 
> - Make sure debugfs/i915_wedged is actually useful to reset a wedged
>   gpu - atm it bails out with -EAGAIN for a terminally wedged gpu.
>   That's because reset_in_progress actually includes that terminal
>   state, which is super-confusing (and most callers got this wrong).
>   Fix this by changing the semantics of reset_in_progress to do what
>   it says on the tin (and fixup all other callers).
> 
>   Also make sure that reset_in_progress is cleared when we reach the
>   terminal "wedged" state. Without this we kept returning -EAGAIN in
>   some places forever.
> 
> - Second problem with debugfs/i915_wedge was that we never got out of
>   the terminal wedged state. Hence manually set the reset counter out
>   of that state before starting the hung gpu recovery.
> 
>   For safety also make sure that we are consintent with resetting the
>   gpu between i915_reset_and_wakeup and i915_handler_error by just
>   passing the boolean "wedged" around instead of trying to recompute
>   it wrongly. This isn't an issue for gem_eio with the debugfs fix,
>   but just general robustness of the code.
> 
> - Finally make sure that when gpu reset is disabled through the module
>   paramter the kernel doesn't generate dmesg noise that would upset
>   our CI/piglit. Simplest solution for that was to lift the i915.reset
>   check up into i915_reset().
> 
> With all these changes, plus the igt fixes I've posted, gem_eio is now
> happy on many snb.
> 
> v2: Don't upset lockdep with my set_domain_ioctl changes.

Blergh, forgotten to update the commit message:

v3: Don't special case set_domain_ioctl, instead curb all -EIO at the
source in wait_request, like in Chris' patch. That means anyone waiting
for a request will not notice the -EIO and fall over due to that. We
definitely want that for modeset code.

> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Testcase: igt/gem_eio/*
> Signed-off-by: Daniel Vetter <daniel.vetter@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |  4 ++++
>  drivers/gpu/drm/i915/i915_drv.c     |  6 ++++++
>  drivers/gpu/drm/i915/i915_drv.h     |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c     | 23 +++++++++++------------
>  drivers/gpu/drm/i915/i915_irq.c     | 11 ++++++-----
>  drivers/gpu/drm/i915/intel_uncore.c |  3 ---
>  6 files changed, 28 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 411a9c68b4ee..c4006c09ef1b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4744,6 +4744,10 @@ i915_wedged_set(void *data, u64 val)
>  	if (i915_reset_in_progress(&dev_priv->gpu_error))
>  		return -EAGAIN;
>  
> +	/* Already wedged, force us out of this terminal state. */
> +	if (i915_terminally_wedged(&dev_priv->gpu_error))
> +		atomic_set(&dev_priv->gpu_error.reset_counter, 0);
> +
>  	intel_runtime_pm_get(dev_priv);
>  
>  	i915_handle_error(dev, val,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index ec1e877c4a36..1f5386bb78f4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -909,6 +909,12 @@ int i915_reset(struct drm_device *dev)
>  
>  	simulated = dev_priv->gpu_error.stop_rings != 0;
>  
> +	if (!i915.reset) {
> +		DRM_INFO("GPU reset disabled in module option, not resetting\n");
> +		mutex_unlock(&dev->struct_mutex);
> +		return -ENODEV;
> +	}
> +
>  	ret = intel_gpu_reset(dev);
>  
>  	/* Also reset the gpu hangman. */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a47e0f4fab56..8876b4891d56 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2939,7 +2939,7 @@ int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
>  static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
>  {
>  	return unlikely(atomic_read(&error->reset_counter)
> -			& (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
> +			& I915_RESET_IN_PROGRESS_FLAG);
>  }
>  
>  static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f10a5d57377e..64c63409b9d0 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -85,8 +85,7 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
>  {
>  	int ret;
>  
> -#define EXIT_COND (!i915_reset_in_progress(error) || \
> -		   i915_terminally_wedged(error))
> +#define EXIT_COND (!i915_reset_in_progress(error))
>  	if (EXIT_COND)
>  		return 0;
>  
> @@ -1113,16 +1112,16 @@ int
>  i915_gem_check_wedge(struct i915_gpu_error *error,
>  		     bool interruptible)
>  {
> +	/* Recovery complete, but the reset failed ... */
> +	if (i915_terminally_wedged(error))
> +		return -EIO;
> +
>  	if (i915_reset_in_progress(error)) {
>  		/* Non-interruptible callers can't handle -EAGAIN, hence return
>  		 * -EIO unconditionally for these. */
>  		if (!interruptible)
>  			return -EIO;
>  
> -		/* Recovery complete, but the reset failed ... */
> -		if (i915_terminally_wedged(error))
> -			return -EIO;
> -
>  		/*
>  		 * Check if GPU Reset is in progress - we need intel_ring_begin
>  		 * to work properly to reinit the hw state while the gpu is
> @@ -1239,11 +1238,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  		/* We need to check whether any gpu reset happened in between
>  		 * the caller grabbing the seqno and now ... */
>  		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
> -			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
> -			 * is truely gone. */
> -			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> -			if (ret == 0)
> -				ret = -EAGAIN;
> +			ret = -EAGAIN;
>  			break;
>  		}
>  
> @@ -1577,7 +1572,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>  
>  	ret = i915_mutex_lock_interruptible(dev);
>  	if (ret)
> -		return ret;
> +		goto out;
>  
>  	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
>  	if (&obj->base == NULL) {
> @@ -1609,6 +1604,10 @@ unref:
>  	drm_gem_object_unreference(&obj->base);
>  unlock:
>  	mutex_unlock(&dev->struct_mutex);
> +out:
> +	/* ABI: set_domain_ioctl must not fail even when the gpu is wedged. */
> +	WARN_ON(ret == -EIO);
> +
>  	return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c8ba94968aaf..dbbc6159584b 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2401,7 +2401,8 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>   * Fire an error uevent so userspace can see that a hang or error
>   * was detected.
>   */
> -static void i915_reset_and_wakeup(struct drm_device *dev)
> +static void i915_reset_and_wakeup(struct drm_device *dev,
> +				  bool wedged)
>  {
>  	struct drm_i915_private *dev_priv = to_i915(dev);
>  	struct i915_gpu_error *error = &dev_priv->gpu_error;
> @@ -2422,7 +2423,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
>  	 * the reset in-progress bit is only ever set by code outside of this
>  	 * work we don't need to worry about any other races.
>  	 */
> -	if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) {
> +	if (wedged) {
>  		DRM_DEBUG_DRIVER("resetting chip\n");
>  		kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
>  				   reset_event);
> @@ -2432,7 +2433,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
>  		 * reference held, for example because there is a pending GPU
>  		 * request that won't finish until the reset is done. This
>  		 * isn't the case at least when we get here by doing a
> -		 * simulated reset via debugs, so get an RPM reference.
> +		 * simulated reset via debugfs, so get an RPM reference.
>  		 */
>  		intel_runtime_pm_get(dev_priv);
>  
> @@ -2467,7 +2468,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
>  			kobject_uevent_env(&dev->primary->kdev->kobj,
>  					   KOBJ_CHANGE, reset_done_event);
>  		} else {
> -			atomic_or(I915_WEDGED, &error->reset_counter);
> +			atomic_set(&error->reset_counter, I915_WEDGED);
>  		}
>  
>  		/*
> @@ -2614,7 +2615,7 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
>  		i915_error_wake_up(dev_priv, false);
>  	}
>  
> -	i915_reset_and_wakeup(dev);
> +	i915_reset_and_wakeup(dev, wedged);
>  }
>  
>  /* Called from drm generic code, passed 'crtc' which
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index c2358ba78b30..4c5ae1154dd1 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1543,9 +1543,6 @@ not_ready:
>  
>  static int (*intel_get_gpu_reset(struct drm_device *dev))(struct drm_device *)
>  {
> -	if (!i915.reset)
> -		return NULL;
> -
>  	if (INTEL_INFO(dev)->gen >= 8)
>  		return gen8_do_reset;
>  	else if (INTEL_INFO(dev)->gen >= 6)
> -- 
> 2.1.0
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx