Re: [PATCH 09/32] drm/i915: Prevent leaking of -EIO from i915_wait_request()

Daniel Vetter <daniel@xxxxxxxx> · Wed, 16 Dec 2015 10:52:06 +0100

On Fri, Dec 11, 2015 at 11:33:05AM +0000, Chris Wilson wrote:
> Reporting -EIO from i915_wait_request() has proven very troublematic
> over the years, with numerous hard-to-reproduce bugs cropping up in the
> corner case of where a reset occurs and the code wasn't expecting such
> an error.
> 
> If the we reset the GPU or have detected a hang and wish to reset the
> GPU, the request is forcibly complete and the wait broken. Currently, we
> report either -EAGAIN or -EIO in order for the caller to retreat and
> restart the wait (if appropriate) after dropping and then reacquiring
> the struct_mutex (essential to allow the GPU reset to proceed). However,
> if we take the view that the request is complete (no further work will
> be done on it by the GPU because it is dead and soon to be reset), then
> we can proceed with the task at hand and then drop the struct_mutex
> allowing the reset to occur. This transfers the burden of checking
> whether it is safe to proceed to the caller, which in all but one
> instance it is safe - completely eliminating the source of all spurious
> -EIO.
> 
> Of note, we only have two API entry points where we expect that
> userspace can observe an EIO. First is when submitting an execbuf, if
> the GPU is terminally wedged, then the operation cannot succeed and an
> -EIO is reported. Secondly, existing userspace uses the throttle ioctl
> to detect an already wedged GPU before starting using HW acceleration
> (or to confirm that the GPU is wedged after an error condition). So if
> the GPU is wedged when the user calls throttle, also report -EIO.
> 
> v2: Split more carefully the change to i915_wait_request() and assorted
> ABI from the reset handling.
> 
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Daniel Vetter <daniel.vetter@xxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  2 --
>  drivers/gpu/drm/i915/i915_gem.c         | 44 ++++++++++++++++-----------------
>  drivers/gpu/drm/i915/i915_gem_userptr.c |  6 ++---
>  drivers/gpu/drm/i915/intel_display.c    |  8 +-----
>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  2 +-
>  6 files changed, 27 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f30c305a6889..7acbc072973a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2987,8 +2987,6 @@ i915_gem_find_active_request(struct intel_engine_cs *ring);
>  
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_engine_cs *ring);
> -int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> -				      bool interruptible);
>  
>  static inline u32 i915_reset_counter(struct i915_gpu_error *error)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b17cc0e42a4f..f5760869a17c 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -208,11 +208,10 @@ i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj)
>  	BUG_ON(obj->madv == __I915_MADV_PURGED);
>  
>  	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> -	if (ret) {
> +	if (WARN_ON(ret)) {
>  		/* In the event of a disaster, abandon all caches and
>  		 * hope for the best.
>  		 */
> -		WARN_ON(ret != -EIO);
>  		obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	}
>  
> @@ -1106,15 +1105,13 @@ put_rpm:
>  	return ret;
>  }
>  
> -int
> -i915_gem_check_wedge(struct i915_gpu_error *error,
> -		     bool interruptible)
> +static int
> +i915_gem_check_wedge(unsigned reset_counter, bool interruptible)
>  {
> -	if (i915_reset_in_progress_or_wedged(error)) {
> -		/* Recovery complete, but the reset failed ... */
> -		if (i915_terminally_wedged(error))
> -			return -EIO;
> +	if (__i915_terminally_wedged(reset_counter))
> +		return -EIO;
>  
> +	if (__i915_reset_in_progress(reset_counter)) {
>  		/* Non-interruptible callers can't handle -EAGAIN, hence return
>  		 * -EIO unconditionally for these. */
>  		if (!interruptible)
> @@ -1285,13 +1282,14 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  		prepare_to_wait(&ring->irq_queue, &wait, state);
>  
>  		/* We need to check whether any gpu reset happened in between
> -		 * the caller grabbing the seqno and now ... */
> +		 * the request being submitted and now. If a reset has occurred,
> +		 * the request is effectively complete (we either are in the
> +		 * process of or have discarded the rendering and completely
> +		 * reset the GPU. The results of the request are lost and we
> +		 * are free to continue on with the original operation.
> +		 */
>  		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> -			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
> -			 * is truely gone. */
> -			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> -			if (ret == 0)
> -				ret = -EAGAIN;
> +			ret = 0;
>  			break;
>  		}
>  
> @@ -2154,11 +2152,10 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
>  	BUG_ON(obj->madv == __I915_MADV_PURGED);
>  
>  	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> -	if (ret) {
> +	if (WARN_ON(ret)) {
>  		/* In the event of a disaster, abandon all caches and
>  		 * hope for the best.
>  		 */
> -		WARN_ON(ret != -EIO);
>  		i915_gem_clflush_object(obj, true);
>  		obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	}
> @@ -2678,8 +2675,11 @@ int i915_gem_request_alloc(struct intel_engine_cs *ring,
>  
>  	*req_out = NULL;
>  
> -	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> -				   dev_priv->mm.interruptible);
> +	/* ABI: Before userspace accesses the GPU (e.g. execbuffer), report
> +	 * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
> +	 * and restart.
> +	 */
> +	ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible);
>  	if (ret)
>  		return ret;
>  
> @@ -4093,9 +4093,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
>  	if (ret)
>  		return ret;
>  
> -	ret = i915_gem_check_wedge(&dev_priv->gpu_error, false);
> -	if (ret)
> -		return ret;
> +	/* ABI: return -EIO if already wedged */
> +	if (i915_terminally_wedged(&dev_priv->gpu_error))
> +		return -EIO;
>  
>  	spin_lock(&file_priv->mm.lock);
>  	list_for_each_entry(request, &file_priv->mm.request_list, client_list) {
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 19fb0bddc1cd..1a5f89dba4af 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -81,10 +81,8 @@ static void __cancel_userptr__worker(struct work_struct *work)
>  		was_interruptible = dev_priv->mm.interruptible;
>  		dev_priv->mm.interruptible = false;
>  
> -		list_for_each_entry_safe(vma, tmp, &obj->vma_list, vma_link) {
> -			int ret = i915_vma_unbind(vma);
> -			WARN_ON(ret && ret != -EIO);
> -		}
> +		list_for_each_entry_safe(vma, tmp, &obj->vma_list, vma_link)
> +			WARN_ON(i915_vma_unbind(vma));
>  		WARN_ON(i915_gem_object_put_pages(obj));
>  
>  		dev_priv->mm.interruptible = was_interruptible;
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index d7bbd015de35..875bdf814d73 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -13405,10 +13405,6 @@ static int intel_atomic_prepare_commit(struct drm_device *dev,
>  
>  			ret = __i915_wait_request(intel_plane_state->wait_req,
>  						  true, NULL, NULL);
> -
> -			/* Swallow -EIO errors to allow updates during hw lockup. */
> -			if (ret == -EIO)

I'd like to keep a WARN_ON here since it's ABI relevant, and consistent
with the other places we've had a WARN_ON(-EIO).

> -				ret = 0;
>  			if (ret) {
>  				mutex_lock(&dev->struct_mutex);
>  				drm_atomic_helper_cleanup_planes(dev, state);
> @@ -13744,9 +13740,7 @@ intel_prepare_plane_fb(struct drm_plane *plane,
>  		 */
>  		if (needs_modeset(crtc_state))
>  			ret = i915_gem_object_wait_rendering(old_obj, true);
> -
> -		/* Swallow -EIO errors to allow updates during hw lockup. */
> -		if (ret && ret != -EIO)
> +		if (ret)

Same here really.

We already have a WARN_ON in the mmio_flip, so are covered there already.

Otherwise looks good, and with the above addressed and under the
assumption that the last caller of check_wedge is in request_alloc (which
I was too lazy to check perfectly):

Reviewed-by: Daniel Vetter <daniel.vetter@xxxxxxxx>

>  			return ret;
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 69b298eed69d..7030ff526941 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -976,7 +976,7 @@ void intel_logical_ring_stop(struct intel_engine_cs *ring)
>  		return;
>  
>  	ret = intel_ring_idle(ring);
> -	if (ret && !i915_reset_in_progress_or_wedged(&to_i915(ring->dev)->gpu_error))
> +	if (ret)
>  		DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
>  			  ring->name, ret);
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 942f86b316c2..6cecc15ec01b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -3062,7 +3062,7 @@ intel_stop_ring_buffer(struct intel_engine_cs *ring)
>  		return;
>  
>  	ret = intel_ring_idle(ring);
> -	if (ret && !i915_reset_in_progress_or_wedged(&to_i915(ring->dev)->gpu_error))
> +	if (ret)
>  		DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
>  			  ring->name, ret);
>  
> -- 
> 2.6.3
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx