If we haven't even begun executing the payload of the stalled request, then we should not claim that its userspace context was guilty of submitting a hanging batch. v2: Check for context corruption before trying to restart. v3: Preserve semaphores on skipping requests (need to keep the timelines intact). Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/intel_lrc.c | 42 +++++++++++++++++-- drivers/gpu/drm/i915/selftests/igt_spinner.c | 9 +++- .../gpu/drm/i915/selftests/intel_hangcheck.c | 6 +++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 5e98fd79bd9d..e3134a635926 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq) *cs++ = rq->fence.seqno - 1; intel_ring_advance(rq, cs); + + /* Record the updated position of the request's payload */ + rq->infix = intel_ring_offset(rq, cs); + return 0; } @@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine) spin_unlock_irqrestore(&engine->timeline.lock, flags); } +static bool lrc_regs_ok(const struct i915_request *rq) +{ + const struct intel_ring *ring = rq->ring; + const u32 *regs = rq->hw_context->lrc_reg_state; + + /* Quick spot check for the common signs of context corruption */ + + if (regs[CTX_RING_BUFFER_CONTROL + 1] != + (RING_CTL_SIZE(ring->size) | RING_VALID)) + return false; + + if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma)) + return false; + + return true; +} + static void execlists_reset(struct intel_engine_cs *engine, bool stalled) { struct intel_engine_execlists * const execlists = &engine->execlists; @@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled) if (!rq) goto out_unlock; + /* + * If this request hasn't started yet, e.g. it is waiting on a + * semaphore, we need to avoid skipping the request or else we + * break the signaling chain. However, if the context is corrupt + * the request will not restart and we will be stuck with a wedged + * device. It is quite often the case that if we issue a reset + * while the GPU is loading the context image, that context image + * becomes corrupt. + * + * Otherwise, if we have not started yet, the request should replay + * perfectly and we do not need to flag the result as being erroneous. + */ + if (!i915_request_started(rq) && lrc_regs_ok(rq)) + goto out_unlock; + /* * If the request was innocent, we leave the request in the ELSP * and will try to replay it on restarting. The context image may @@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled) * image back to the expected values to skip over the guilty request. */ i915_reset_request(rq, stalled); - if (!stalled) + if (!stalled && lrc_regs_ok(rq)) goto out_unlock; /* @@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled) engine->context_size - PAGE_SIZE); } - /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */ - rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix); + /* Rerun the request; its payload has been neutered (if guilty). */ + rq->ring->head = intel_ring_wrap(rq->ring, rq->head); intel_ring_update_space(rq->ring); execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring); diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c index 9ebd9225684e..86354e51bdd3 100644 --- a/drivers/gpu/drm/i915/selftests/igt_spinner.c +++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c @@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin, *batch++ = upper_32_bits(vma->node.start); *batch++ = MI_BATCH_BUFFER_END; /* not reached */ - i915_gem_chipset_flush(spin->i915); + if (engine->emit_init_breadcrumb && + rq->timeline->has_initial_breadcrumb) { + err = engine->emit_init_breadcrumb(rq); + if (err) + goto cancel_rq; + } err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0); + i915_gem_chipset_flush(spin->i915); + cancel_rq: if (err) { i915_request_skip(rq, err); diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index 4886fac12628..36c17bfe05a7 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -246,6 +246,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine) if (INTEL_GEN(vm->i915) <= 5) flags |= I915_DISPATCH_SECURE; + if (rq->engine->emit_init_breadcrumb) { + err = rq->engine->emit_init_breadcrumb(rq); + if (err) + goto cancel_rq; + } + err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); cancel_rq: -- 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx