Re: [PATCH 1/2] drm/i915: Apply i915_request_skip() on submission

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> Trying to use i915_request_skip() prior to i915_request_add() causes us
> to try and fill the ring upto request->postfix, which has not yet been
> set, and so may cause us to memset() past the end of the ring.
>
> Instead of skipping the request immediately, just flag the error on the
> request (only accepting the first fatal error we see) and then clear the
> request upon submission.
>
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Matthew Auld <matthew.auld@xxxxxxxxx>

I am now convinced that if skip happens in a wrong spot,
we really might end up filling up zeros in ring when
we shouldn't

Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>


> ---
>  .../gpu/drm/i915/gem/i915_gem_client_blt.c    |  2 +-
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  6 +-
>  .../gpu/drm/i915/gem/i915_gem_object_blt.c    |  4 +-
>  .../drm/i915/gem/selftests/i915_gem_context.c |  6 +-
>  .../drm/i915/gem/selftests/igt_gem_utils.c    |  2 +-
>  drivers/gpu/drm/i915/gt/intel_lrc.c           |  4 +-
>  drivers/gpu/drm/i915/gt/intel_reset.c         | 13 ++--
>  .../gpu/drm/i915/gt/intel_ring_submission.c   |  4 +-
>  drivers/gpu/drm/i915/gt/mock_engine.c         |  4 +-
>  drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |  2 +-
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  4 +-
>  drivers/gpu/drm/i915/i915_request.c           | 67 +++++++++++++------
>  drivers/gpu/drm/i915/i915_request.h           |  5 +-
>  drivers/gpu/drm/i915/selftests/igt_spinner.c  |  2 +-
>  14 files changed, 75 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> index 81366aa4812b..0598e5382a1d 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> @@ -217,7 +217,7 @@ static void clear_pages_worker(struct work_struct *work)
>  					   0);
>  out_request:
>  	if (unlikely(err)) {
> -		i915_request_skip(rq, err);
> +		i915_request_set_error_once(rq, err);
>  		err = 0;
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index a1636c168e1f..bb27813ad5bf 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -1169,7 +1169,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
>  	goto out_pool;
>  
>  skip_request:
> -	i915_request_skip(rq, err);
> +	i915_request_set_error_once(rq, err);
>  err_request:
>  	i915_request_add(rq);
>  err_unpin:
> @@ -1850,7 +1850,7 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
>  	return 0;
>  
>  err_skip:
> -	i915_request_skip(eb->request, err);
> +	i915_request_set_error_once(eb->request, err);
>  	return err;
>  }
>  
> @@ -2579,7 +2579,7 @@ static void eb_request_add(struct i915_execbuffer *eb)
>  			attr.priority |= I915_PRIORITY_WAIT;
>  	} else {
>  		/* Serialise with context_close via the add_to_timeline */
> -		i915_request_skip(rq, -ENOENT);
> +		i915_request_set_error_once(rq, -ENOENT);
>  	}
>  
>  	local_bh_disable();
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
> index 70809d8897cd..39b8a055d80a 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
> @@ -186,7 +186,7 @@ int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
>  					0);
>  out_request:
>  	if (unlikely(err))
> -		i915_request_skip(rq, err);
> +		i915_request_set_error_once(rq, err);
>  
>  	i915_request_add(rq);
>  out_batch:
> @@ -385,7 +385,7 @@ int i915_gem_object_copy_blt(struct drm_i915_gem_object *src,
>  	drm_gem_unlock_reservations(objs, ARRAY_SIZE(objs), &acquire);
>  out_request:
>  	if (unlikely(err))
> -		i915_request_skip(rq, err);
> +		i915_request_set_error_once(rq, err);
>  
>  	i915_request_add(rq);
>  out_batch:
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> index 375d864736f3..77c7e65de7c3 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> @@ -1004,7 +1004,7 @@ emit_rpcs_query(struct drm_i915_gem_object *obj,
>  	return 0;
>  
>  skip_request:
> -	i915_request_skip(rq, err);
> +	i915_request_set_error_once(rq, err);
>  err_request:
>  	i915_request_add(rq);
>  err_batch:
> @@ -1559,7 +1559,7 @@ static int write_to_scratch(struct i915_gem_context *ctx,
>  
>  	goto out_vm;
>  skip_request:
> -	i915_request_skip(rq, err);
> +	i915_request_set_error_once(rq, err);
>  err_request:
>  	i915_request_add(rq);
>  err_unpin:
> @@ -1708,7 +1708,7 @@ static int read_from_scratch(struct i915_gem_context *ctx,
>  
>  	goto out_vm;
>  skip_request:
> -	i915_request_skip(rq, err);
> +	i915_request_set_error_once(rq, err);
>  err_request:
>  	i915_request_add(rq);
>  err_unpin:
> diff --git a/drivers/gpu/drm/i915/gem/selftests/igt_gem_utils.c b/drivers/gpu/drm/i915/gem/selftests/igt_gem_utils.c
> index 6718da20f35d..772d8cba7da9 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/igt_gem_utils.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/igt_gem_utils.c
> @@ -159,7 +159,7 @@ int igt_gpu_fill_dw(struct intel_context *ce,
>  	return 0;
>  
>  skip_request:
> -	i915_request_skip(rq, err);
> +	i915_request_set_error_once(rq, err);
>  err_request:
>  	i915_request_add(rq);
>  err_batch:
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index b9b3f78f1324..d123dd7fe98b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -245,7 +245,7 @@ static void mark_eio(struct i915_request *rq)
>  
>  	GEM_BUG_ON(i915_request_signaled(rq));
>  
> -	dma_fence_set_error(&rq->fence, -EIO);
> +	i915_request_set_error_once(rq, -EIO);
>  	i915_request_mark_complete(rq);
>  }
>  
> @@ -4903,7 +4903,7 @@ static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
>  	mask = rq->execution_mask;
>  	if (unlikely(!mask)) {
>  		/* Invalid selection, submit to a random engine in error */
> -		i915_request_skip(rq, -ENODEV);
> +		i915_request_set_error_once(rq, -ENODEV);
>  		mask = ve->siblings[0]->mask;
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index aef6ab58d7d9..10ad816b32e2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -48,8 +48,10 @@ static void engine_skip_context(struct i915_request *rq)
>  
>  	lockdep_assert_held(&engine->active.lock);
>  	list_for_each_entry_continue(rq, &engine->active.requests, sched.link)
> -		if (rq->context == hung_ctx)
> -			i915_request_skip(rq, -EIO);
> +		if (rq->context == hung_ctx) {
> +			i915_request_set_error_once(rq, -EIO);
> +			__i915_request_skip(rq);
> +		}
>  }
>  
>  static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
> @@ -154,11 +156,12 @@ void __i915_request_reset(struct i915_request *rq, bool guilty)
>  
>  	rcu_read_lock(); /* protect the GEM context */
>  	if (guilty) {
> -		i915_request_skip(rq, -EIO);
> +		i915_request_set_error_once(rq, -EIO);
> +		__i915_request_skip(rq);
>  		if (mark_guilty(rq))
>  			engine_skip_context(rq);
>  	} else {
> -		dma_fence_set_error(&rq->fence, -EAGAIN);
> +		i915_request_set_error_once(rq, -EAGAIN);
>  		mark_innocent(rq);
>  	}
>  	rcu_read_unlock();
> @@ -785,7 +788,7 @@ static void nop_submit_request(struct i915_request *request)
>  	unsigned long flags;
>  
>  	RQ_TRACE(request, "-EIO\n");
> -	dma_fence_set_error(&request->fence, -EIO);
> +	i915_request_set_error_once(request, -EIO);
>  
>  	spin_lock_irqsave(&engine->active.lock, flags);
>  	__i915_request_submit(request);
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index fee743626060..ee241b7eaa3b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -895,9 +895,7 @@ static void reset_cancel(struct intel_engine_cs *engine)
>  
>  	/* Mark all submitted requests as skipped. */
>  	list_for_each_entry(request, &engine->active.requests, sched.link) {
> -		if (!i915_request_signaled(request))
> -			dma_fence_set_error(&request->fence, -EIO);
> -
> +		i915_request_set_error_once(request, -EIO);
>  		i915_request_mark_complete(request);
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c
> index 5633515c12e9..4a53ded7c2dd 100644
> --- a/drivers/gpu/drm/i915/gt/mock_engine.c
> +++ b/drivers/gpu/drm/i915/gt/mock_engine.c
> @@ -244,9 +244,7 @@ static void mock_reset_cancel(struct intel_engine_cs *engine)
>  
>  	/* Mark all submitted requests as skipped. */
>  	list_for_each_entry(request, &engine->active.requests, sched.link) {
> -		if (!i915_request_signaled(request))
> -			dma_fence_set_error(&request->fence, -EIO);
> -
> +		i915_request_set_error_once(request, -EIO);
>  		i915_request_mark_complete(request);
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> index c3514ec7b8db..2b2efff6e19d 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> @@ -268,7 +268,7 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
>  
>  cancel_rq:
>  	if (err) {
> -		i915_request_skip(rq, err);
> +		i915_request_set_error_once(rq, err);
>  		i915_request_add(rq);
>  	}
>  unpin_hws:
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 1beaa77f9bb6..fe7778c28d2d 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -456,9 +456,7 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>  
>  	/* Mark all executing requests as skipped. */
>  	list_for_each_entry(rq, &engine->active.requests, sched.link) {
> -		if (!i915_request_signaled(rq))
> -			dma_fence_set_error(&rq->fence, -EIO);
> -
> +		i915_request_set_error_once(rq, -EIO);
>  		i915_request_mark_complete(rq);
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index d837c1380015..e88b027f80c1 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -363,6 +363,48 @@ __await_execution(struct i915_request *rq,
>  	return 0;
>  }
>  
> +void __i915_request_skip(struct i915_request *rq)
> +{
> +	if (rq->infix == rq->postfix)
> +		return;
> +
> +	/*
> +	 * As this request likely depends on state from the lost
> +	 * context, clear out all the user operations leaving the
> +	 * breadcrumb at the end (so we get the fence notifications).
> +	 */
> +	__i915_request_fill(rq, 0);
> +	rq->infix = rq->postfix;
> +}
> +
> +static bool fatal_error(int error)
> +{
> +	switch (error) {
> +	case 0: /* not an error! */
> +	case -EAGAIN: /* innocent victim of a GT reset (__i915_request_reset) */
> +	case -ETIMEDOUT: /* waiting for a Godot (timer_i915_sw_fence_wake) */
> +		return false;
> +	default:
> +		return true;
> +	}
> +}
> +
> +void i915_request_set_error_once(struct i915_request *rq, int error)
> +{
> +	int old;
> +
> +	GEM_BUG_ON(!IS_ERR_VALUE((long)error));
> +
> +	if (i915_request_signaled(rq))
> +		return;
> +
> +	old = READ_ONCE(rq->fence.error);
> +	do {
> +		if (fatal_error(old))
> +			return;
> +	} while (!try_cmpxchg(&rq->fence.error, &old, error));
> +}
> +
>  bool __i915_request_submit(struct i915_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
> @@ -392,8 +434,10 @@ bool __i915_request_submit(struct i915_request *request)
>  	if (i915_request_completed(request))
>  		goto xfer;
>  
> -	if (intel_context_is_banned(request->context))
> -		i915_request_skip(request, -EIO);
> +	if (unlikely(intel_context_is_banned(request->context)))
> +		i915_request_set_error_once(request, -EIO);
> +	if (unlikely(fatal_error(request->fence.error)))
> +		__i915_request_skip(request);
>  
>  	/*
>  	 * Are we using semaphores when the gpu is already saturated?
> @@ -519,7 +563,7 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>  		trace_i915_request_submit(request);
>  
>  		if (unlikely(fence->error))
> -			i915_request_skip(request, fence->error);
> +			i915_request_set_error_once(request, fence->error);
>  
>  		/*
>  		 * We need to serialize use of the submit_request() callback
> @@ -1209,23 +1253,6 @@ i915_request_await_object(struct i915_request *to,
>  	return ret;
>  }
>  
> -void i915_request_skip(struct i915_request *rq, int error)
> -{
> -	GEM_BUG_ON(!IS_ERR_VALUE((long)error));
> -	dma_fence_set_error(&rq->fence, error);
> -
> -	if (rq->infix == rq->postfix)
> -		return;
> -
> -	/*
> -	 * As this request likely depends on state from the lost
> -	 * context, clear out all the user operations leaving the
> -	 * breadcrumb at the end (so we get the fence notifications).
> -	 */
> -	__i915_request_fill(rq, 0);
> -	rq->infix = rq->postfix;
> -}
> -
>  static struct i915_request *
>  __i915_request_add_to_timeline(struct i915_request *rq)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index da8420f03232..d4bae16b4785 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -303,6 +303,9 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp);
>  struct i915_request * __must_check
>  i915_request_create(struct intel_context *ce);
>  
> +void i915_request_set_error_once(struct i915_request *rq, int error);
> +void __i915_request_skip(struct i915_request *rq);
> +
>  struct i915_request *__i915_request_commit(struct i915_request *request);
>  void __i915_request_queue(struct i915_request *rq,
>  			  const struct i915_sched_attr *attr);
> @@ -352,8 +355,6 @@ void i915_request_add(struct i915_request *rq);
>  bool __i915_request_submit(struct i915_request *request);
>  void i915_request_submit(struct i915_request *request);
>  
> -void i915_request_skip(struct i915_request *request, int error);
> -
>  void __i915_request_unsubmit(struct i915_request *request);
>  void i915_request_unsubmit(struct i915_request *request);
>  
> diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
> index e8a58fe49c39..9ad4ab088466 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
> @@ -183,7 +183,7 @@ igt_spinner_create_request(struct igt_spinner *spin,
>  
>  cancel_rq:
>  	if (err) {
> -		i915_request_skip(rq, err);
> +		i915_request_set_error_once(rq, err);
>  		i915_request_add(rq);
>  	}
>  unpin_hws:
> -- 
> 2.25.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx



[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux