Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Following a reset, the context and page directory registers are lost. > However, the queue of requests that we resubmit after the reset may > depend upon them - the registers are restored from a context image, but > that restore may be inhibited and may simply be absent from the request > if it was in the middle of a sequence using the same context. If we > prime the CCID/PD registers with the first request in the queue (even > for the hung request), we prevent invalid memory access for the > following requests (and continually hung engines). > > v2: Magic BIT(8), reserved for future use but still appears unused. > v3: Some commentary on handling innocent vs guilty requests > v4: Add a wait for PD_BASE fetch. The reload appears to be instant on my > Ivybridge, but this bit probably exists for a reason. > > Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests") > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_gem.c | 18 ++++------ > drivers/gpu/drm/i915/i915_reg.h | 6 ++-- > drivers/gpu/drm/i915/intel_lrc.c | 16 ++++++++- > drivers/gpu/drm/i915/intel_ringbuffer.c | 58 +++++++++++++++++++++++++++++++-- > 4 files changed, 81 insertions(+), 17 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 8a510c7f6828..b7632bbbafd8 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -2743,21 +2743,17 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine) > engine->irq_seqno_barrier(engine); > > request = i915_gem_find_active_request(engine); > - if (!request) > - return; > - > - if (!i915_gem_reset_request(request)) > - return; > + if (request && i915_gem_reset_request(request)) { > + DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n", > + engine->name, request->global_seqno); > > - DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n", > - engine->name, request->global_seqno); > + /* If this context is now banned, skip all pending requests. */ > + if (i915_gem_context_is_banned(request->ctx)) > + engine_skip_context(request); > + } > > /* Setup the CS to resume from the breadcrumb of the hung request */ > engine->reset_hw(engine, request); > - > - /* If this context is now banned, skip all of its pending requests. */ > - if (i915_gem_context_is_banned(request->ctx)) > - engine_skip_context(request); > } > > void i915_gem_reset_finish(struct drm_i915_private *dev_priv) > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index 5bf36bdc5926..4a59091c0152 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -3323,8 +3323,10 @@ enum skl_disp_power_wells { > /* > * Logical Context regs > */ > -#define CCID _MMIO(0x2180) > -#define CCID_EN (1<<0) > +#define CCID _MMIO(0x2180) > +#define CCID_EN BIT(0) > +#define CCID_EXTENDED_STATE_RESTORE BIT(2) > +#define CCID_EXTENDED_STATE_SAVE BIT(3) > /* > * Notes on SNB/IVB/VLV context size: > * - Power context is saved elsewhere (LLC or stolen) > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index df8e6f74dc7e..e42990b56fa8 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -1352,7 +1352,20 @@ static void reset_common_ring(struct intel_engine_cs *engine, > struct drm_i915_gem_request *request) > { > struct execlist_port *port = engine->execlist_port; > - struct intel_context *ce = &request->ctx->engine[engine->id]; > + struct intel_context *ce; > + > + /* If the request was innocent, we leave the request in the ELSP > + * and will try to replay it on restarting. The context image may > + * have been corrupted by the reset, in which case we may have > + * to service a new GPU hang, but more likely we can continue on > + * without impact. > + * > + * If the request was guilty, we presume the context is corrupt > + * and have to at least restore the RING register in the context > + * image back to the expected values to skip over the guilty request. > + */ > + if (!request || request->fence.error != -EIO) > + return; > > /* We want a simple context + ring to execute the breadcrumb update. > * We cannot rely on the context being intact across the GPU hang, > @@ -1361,6 +1374,7 @@ static void reset_common_ring(struct intel_engine_cs *engine, > * future request will be after userspace has had the opportunity > * to recreate its own state. > */ > + ce = &request->ctx->engine[engine->id]; > execlists_init_reg_state(ce->lrc_reg_state, > request->ctx, engine, ce->ring); > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > index 383083ef2210..d3d1e64f2498 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -599,10 +599,62 @@ static int init_ring_common(struct intel_engine_cs *engine) > static void reset_ring_common(struct intel_engine_cs *engine, > struct drm_i915_gem_request *request) > { > - struct intel_ring *ring = request->ring; > + /* Try to restore the logical GPU state to match the continuation > + * of the request queue. If we skip the context/PD restore, then > + * the next request may try to execute assuming that its context > + * is valid and loaded on the GPU and so may try to access invalid > + * memory, prompting repeated GPU hangs. > + * > + * If the request was guilty, we still restore the logical state > + * in case the next request requires it (e.g. the aliasing ppgtt), > + * but skip over the hung batch. > + * > + * If the request was innocent, we try to replay the request with > + * the restored context. > + */ > + if (request) { > + struct drm_i915_private *dev_priv = request->i915; > + struct intel_context *ce = &request->ctx->engine[engine->id]; > + struct i915_hw_ppgtt *ppgtt; > + > + /* FIXME consider gen8 reset */ > + > + if (ce->state) { > + I915_WRITE(CCID, > + i915_ggtt_offset(ce->state) | > + BIT(8) /* must be set! */ | > + CCID_EXTENDED_STATE_SAVE | > + CCID_EXTENDED_STATE_RESTORE | > + CCID_EN); > + } > > - ring->head = request->postfix; > - ring->last_retired_head = -1; > + ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt; > + if (ppgtt) { > + u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10; > + > + I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G); > + I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset); > + > + /* Wait for the PD reload to complete */ > + if (intel_wait_for_register(dev_priv, > + RING_PP_DIR_BASE(engine), > + BIT(0), 0, > + 10)) > + DRM_ERROR("Wait for reload of ppgtt page-directory timed out\n"); > + > + ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine); > + } > + > + /* If the rq hung, jump to its breadcrumb and skip the batch */ > + if (request->fence.error == -EIO) { > + struct intel_ring *ring = request->ring; > + > + ring->head = request->postfix; > + ring->last_retired_head = -1; > + } > + } else { > + engine->legacy_active_context = NULL; > + } > } > > static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req) > -- > 2.11.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx