Following a reset, the context and page directory registers are lost. However, the queue of requests that we resubmit after the reset may depend upon them - the registers are restored from a context image, but that restore may be inhibited and may simply be absent from the request if it was in the middle of a sequence using the same context. If we prime the CCID/PD registers with the first request in the queue (even for the hung request), we prevent invalid memory access for the following requests (and continually hung engines). Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests") Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> --- This could do with going to stable but requires a few odds and ends, such as dma_fence_set_error(). Oh well, fortunately it is not as bad it might seem since these registers are restored from the context - but that then requires a mesa context to reset the GPU state (as fortunately we called MI_SET_CONTEXT at the start of every batch!), but any other request in the meantime will likely hang again. (Also I left gen8/ringbuffer reset_hw as an exercise for the reader) -Chris --- drivers/gpu/drm/i915/i915_gem.c | 18 +++++++----------- drivers/gpu/drm/i915/intel_lrc.c | 6 +++++- drivers/gpu/drm/i915/intel_ringbuffer.c | 30 +++++++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index fe32b1edbda2..7a16c859c875 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2735,21 +2735,17 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine) engine->irq_seqno_barrier(engine); request = i915_gem_find_active_request(engine); - if (!request) - return; - - if (!i915_gem_reset_request(request)) - return; + if (request && i915_gem_reset_request(request)) { + DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n", + engine->name, request->global_seqno); - DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n", - engine->name, request->global_seqno); + /* If this context is now banned, skip all pending requests. */ + if (i915_gem_context_is_banned(request->ctx)) + engine_skip_context(request); + } /* Setup the CS to resume from the breadcrumb of the hung request */ engine->reset_hw(engine, request); - - /* If this context is now banned, skip all of its pending requests. */ - if (i915_gem_context_is_banned(request->ctx)) - engine_skip_context(request); } void i915_gem_reset_finish(struct drm_i915_private *dev_priv) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 44a92ea464ba..f44dd42e46b5 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1324,7 +1324,10 @@ static void reset_common_ring(struct intel_engine_cs *engine, struct drm_i915_gem_request *request) { struct execlist_port *port = engine->execlist_port; - struct intel_context *ce = &request->ctx->engine[engine->id]; + struct intel_context *ce; + + if (!request || request->fence.error != -EIO) + return; /* We want a simple context + ring to execute the breadcrumb update. * We cannot rely on the context being intact across the GPU hang, @@ -1333,6 +1336,7 @@ static void reset_common_ring(struct intel_engine_cs *engine, * future request will be after userspace has had the opportunity * to recreate its own state. */ + ce = &request->ctx->engine[engine->id]; execlists_init_reg_state(ce->lrc_reg_state, request->ctx, engine, ce->ring); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index d32cbba25d98..b30c1dc3061c 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -599,10 +599,34 @@ static int init_ring_common(struct intel_engine_cs *engine) static void reset_ring_common(struct intel_engine_cs *engine, struct drm_i915_gem_request *request) { - struct intel_ring *ring = request->ring; + if (request) { + struct drm_i915_private *dev_priv = request->i915; + struct intel_context *ce = &request->ctx->engine[engine->id]; + struct i915_hw_ppgtt *ppgtt; + + if (ce->state) { + I915_WRITE(CCID, + i915_ggtt_offset(ce->state) | + BIT(3) | BIT(2) | CCID_EN); + } - ring->head = request->postfix; - ring->last_retired_head = -1; + ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt; + if (ppgtt) { + u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10; + + I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G); + I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset); + } + + if (request->fence.error == -EIO) { + struct intel_ring *ring = request->ring; + + ring->head = request->postfix; + ring->last_retired_head = -1; + } + } else { + engine->legacy_active_context = NULL; + } } static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req) -- 2.11.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx