Re: [PATCH v2 11/22] drm/i915: Perform a direct reset of the GPU from the waiter

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> If a waiter is holding the struct_mutex, then the reset worker cannot
> reset the GPU until the waiter returns. We do not want to return -EAGAIN
> from i915_wait_request as that breaks delicate operations like
> i915_vma_unbind() which often cannot be restarted easily, and returning
> -EIO is just as useless (and has in the past proven dangerous). The
> remaining WARN_ON(i915_wait_request) serves as a valuable reminder that
> handling errors from an indefinite wait is tricky.
>
> We can keep the current semantic, that once a reset is complete so is
> the request, by performing the reset ourselves if we hold the
> mutex.
>
> uevent emission is still handled by the reset worker, so it may appear
> slightly out of order with respect to the actual reset (and concurrent
> use of the device).
>
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         | 11 ++++++-----
>  drivers/gpu/drm/i915/i915_drv.h         | 15 +++------------
>  drivers/gpu/drm/i915/i915_gem_request.c | 29 +++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_irq.c         |  2 ++
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  3 ---
>  5 files changed, 40 insertions(+), 20 deletions(-)
>
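Before the hunks themselves, the locking contract this ends up with, as a
rough sketch (my paraphrase of the i915_drv.c and i915_irq.c changes below,
not literal new code):

	/* Every caller now takes struct_mutex around i915_reset(); the
	 * function itself checks whether a reset is actually pending and
	 * returns early (0, or -EIO if already wedged) if not.
	 */
	mutex_lock(&dev_priv->drm.struct_mutex);
	ret = i915_reset(dev_priv);
	mutex_unlock(&dev_priv->drm.struct_mutex);
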
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 47a676d859db..ff4173e6e298 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1729,6 +1729,8 @@ int i915_resume_switcheroo(struct drm_device *dev)
>   * Reset the chip.  Useful if a hang is detected. Returns zero on successful
>   * reset or otherwise an error code.
>   *
> + * Caller must hold the struct_mutex.
> + *
>   * Procedure is fairly simple:
>   *   - reset the chip using the reset reg
>   *   - re-init context state
> @@ -1743,7 +1745,10 @@ int i915_reset(struct drm_i915_private *dev_priv)
>  	struct i915_gpu_error *error = &dev_priv->gpu_error;
>  	int ret;
>  
> -	mutex_lock(&dev->struct_mutex);
> +	lockdep_assert_held(&dev->struct_mutex);
> +
> +	if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
> +		return test_bit(I915_WEDGED, &error->flags) ? -EIO : 0;
>  
>  	/* Clear any previous failed attempts at recovery. Time to try again. */
>  	__clear_bit(I915_WEDGED, &error->flags);
> @@ -1784,9 +1789,6 @@ int i915_reset(struct drm_i915_private *dev_priv)
>  		goto error;
>  	}
>  
> -	clear_bit(I915_RESET_IN_PROGRESS, &error->flags);
> -	mutex_unlock(&dev->struct_mutex);
> -
>  	/*
>  	 * rps/rc6 re-init is necessary to restore state lost after the
>  	 * reset and the re-install of gt irqs. Skip for ironlake per
> @@ -1800,7 +1802,6 @@ int i915_reset(struct drm_i915_private *dev_priv)
>  
>  error:
>  	set_bit(I915_WEDGED, &error->flags);
> -	mutex_unlock(&dev->struct_mutex);
>  	return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index dd73c00cd774..2e2fd8a77233 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3862,7 +3862,9 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
>  			    schedule_timeout_uninterruptible(remaining_jiffies);
>  	}
>  }
> -static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
> +
> +static inline bool
> +__i915_request_irq_complete(struct drm_i915_gem_request *req)
>  {
>  	struct intel_engine_cs *engine = req->engine;
>  
> @@ -3924,17 +3926,6 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
>  			return true;
>  	}
>  
> -	/* We need to check whether any gpu reset happened in between
> -	 * the request being submitted and now. If a reset has occurred,
> -	 * the seqno will have been advance past ours and our request
> -	 * is complete. If we are in the process of handling a reset,
> -	 * the request is effectively complete as the rendering will
> -	 * be discarded, but we need to return in order to drop the
> -	 * struct_mutex.
> -	 */
> -	if (i915_reset_in_progress(&req->i915->gpu_error))
> -		return true;
> -
>  	return false;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 5f89801e6a16..64c370681a81 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -533,6 +533,16 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>  	engine->submit_request(request);
>  }
>  
> +static void reset_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&q->lock, flags);
> +	if (list_empty(&wait->task_list))
> +		__add_wait_queue(q, wait);
> +	spin_unlock_irqrestore(&q->lock, flags);
> +}
> +
>  static unsigned long local_clock_us(unsigned int *cpu)
>  {
>  	unsigned long t;
> @@ -710,6 +720,25 @@ wakeup:
>  		if (__i915_request_irq_complete(req))
>  			break;
>  
> +		/* If the GPU is hung, and we hold the lock, reset the GPU
> +		 * and then check for completion. On a full reset, the engine's
> +		 * HW seqno will be advanced past us and we are complete.
> +		 * If we do a partial reset, we have to wait for the GPU to
> +		 * resume and update the breadcrumb.
> +		 *
> +		 * If we don't hold the mutex, we can just wait for the worker
> +		 * to come along and update the breadcrumb (either directly
> +		 * itself, or indirectly by recovering the GPU).
> +		 */
> +		if (flags & I915_WAIT_LOCKED &&
> +		    i915_reset_in_progress(&req->i915->gpu_error)) {
> +			__set_current_state(TASK_RUNNING);
> +			i915_reset(req->i915);
> +			reset_wait_queue(&req->i915->gpu_error.wait_queue,
> +					 &reset);

A comment here might be warranted that we are here due to the reset
wakeup, which removed us from the wait queue. Or perhaps rename it to
readd_to_reset_wait_queue.
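
Something along these lines, perhaps (just a sketch; readd_to_reset_wait_queue
is only the rename suggested above):

	/* We got here via the reset wakeup, which removed us from
	 * gpu_error.wait_queue; put ourselves back so that a later
	 * reset can wake us again.
	 */
	readd_to_reset_wait_queue(&req->i915->gpu_error.wait_queue, &reset);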

Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>

> +			continue;
> +		}
> +
>  		/* Only spin if we know the GPU is processing this request */
>  		if (i915_spin_request(req, state, 2))
>  			break;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ed172d7beecb..2c7cb5041511 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2521,7 +2521,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  	 * pending state and not properly drop locks, resulting in
>  	 * deadlocks with the reset work.
>  	 */
> +	mutex_lock(&dev_priv->drm.struct_mutex);
>  	ret = i915_reset(dev_priv);
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
>  
>  	intel_finish_reset(dev_priv);
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index eae261e62b8b..e04b58a8aa0a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2229,9 +2229,6 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
>  	if (ret)
>  		return ret;
>  
> -	if (i915_reset_in_progress(&target->i915->gpu_error))
> -		return -EAGAIN;
> -
>  	i915_gem_request_retire_upto(target);
>  
>  	intel_ring_update_space(ring);
> -- 
> 2.9.3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx