Re: [PATCH] drm/i915: Don't claim an unstarted request was guilty

Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> · Fri, 08 Feb 2019 16:47:13 +0200

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> If we haven't even begun executing the payload of the stalled request,
> then we should not claim that its userspace context was guilty of
> submitting a hanging batch.
>
> v2: Check for context corruption before trying to restart.
> v3: Preserve semaphores on skipping requests (need to keep the timelines
> intact).
>
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c              | 42 +++++++++++++++++--
>  drivers/gpu/drm/i915/selftests/igt_spinner.c  |  9 +++-
>  .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 +++
>  3 files changed, 53 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 5e98fd79bd9d..e3134a635926 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
>  	*cs++ = rq->fence.seqno - 1;
>  
>  	intel_ring_advance(rq, cs);
> +
> +	/* Record the updated position of the request's payload */
> +	rq->infix = intel_ring_offset(rq, cs);
> +
>  	return 0;
>  }
>  
> @@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
>  	spin_unlock_irqrestore(&engine->timeline.lock, flags);
>  }
>  
> +static bool lrc_regs_ok(const struct i915_request *rq)
> +{
> +	const struct intel_ring *ring = rq->ring;
> +	const u32 *regs = rq->hw_context->lrc_reg_state;
> +
> +	/* Quick spot check for the common signs of context corruption */
> +
> +	if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
> +	    (RING_CTL_SIZE(ring->size) | RING_VALID))
> +		return false;

You been noticing this with ctx corruption? Well now
thinking about it, we have had reports where on init,
on some trouble, the valid vanishes.

> +
> +	if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
> +		return false;
> +

Seen this on bugzilla reports too. Are there more
in your sleeve or is this a compromise on complexity
and performance? Checking on a sane actual head too?

The heuristics of it bothers me some as we will
get false positives.

So in effect, when we get one, we just move ahead
after an extra reset as we got it all wrong?

> +	return true;
> +}
> +
>  static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  {
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  	if (!rq)
>  		goto out_unlock;
>  
> +	/*
> +	 * If this request hasn't started yet, e.g. it is waiting on a
> +	 * semaphore, we need to avoid skipping the request or else we
> +	 * break the signaling chain. However, if the context is corrupt
> +	 * the request will not restart and we will be stuck with a wedged
> +	 * device. It is quite often the case that if we issue a reset
> +	 * while the GPU is loading the context image, that context image
> +	 * becomes corrupt.
> +	 *
> +	 * Otherwise, if we have not started yet, the request should replay
> +	 * perfectly and we do not need to flag the result as being erroneous.
> +	 */
> +	if (!i915_request_started(rq) && lrc_regs_ok(rq))
> +		goto out_unlock;
> +
>  	/*
>  	 * If the request was innocent, we leave the request in the ELSP
>  	 * and will try to replay it on restarting. The context image may
> @@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  	 * image back to the expected values to skip over the guilty request.
>  	 */
>  	i915_reset_request(rq, stalled);
> -	if (!stalled)
> +	if (!stalled && lrc_regs_ok(rq))

Extend the ctx validator usage for on guilty engines, well why not.
-Mika

>  		goto out_unlock;
>  
>  	/*
> @@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  		       engine->context_size - PAGE_SIZE);
>  	}
>  
> -	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> -	rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
> +	/* Rerun the request; its payload has been neutered (if guilty). */
> +	rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
>  	intel_ring_update_space(rq->ring);
>  
>  	execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
> diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
> index 9ebd9225684e..86354e51bdd3 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
> @@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
>  	*batch++ = upper_32_bits(vma->node.start);
>  	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
>  
> -	i915_gem_chipset_flush(spin->i915);
> +	if (engine->emit_init_breadcrumb &&
> +	    rq->timeline->has_initial_breadcrumb) {
> +		err = engine->emit_init_breadcrumb(rq);
> +		if (err)
> +			goto cancel_rq;
> +	}
>  
>  	err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
>  
> +	i915_gem_chipset_flush(spin->i915);
> +
>  cancel_rq:
>  	if (err) {
>  		i915_request_skip(rq, err);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 4886fac12628..36c17bfe05a7 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -246,6 +246,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
>  	if (INTEL_GEN(vm->i915) <= 5)
>  		flags |= I915_DISPATCH_SECURE;
>  
> +	if (rq->engine->emit_init_breadcrumb) {
> +		err = rq->engine->emit_init_breadcrumb(rq);
> +		if (err)
> +			goto cancel_rq;
> +	}
> +
>  	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
>  
>  cancel_rq:
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx