Re: [PATCH v4] drm/i915: Restore context and pd for ringbuffer submission after reset

Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> · Tue, 07 Feb 2017 17:54:10 +0200

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> Following a reset, the context and page directory registers are lost.
> However, the queue of requests that we resubmit after the reset may
> depend upon them - the registers are restored from a context image, but
> that restore may be inhibited and may simply be absent from the request
> if it was in the middle of a sequence using the same context. If we
> prime the CCID/PD registers with the first request in the queue (even
> for the hung request), we prevent invalid memory access for the
> following requests (and continually hung engines).
>
> v2: Magic BIT(8), reserved for future use but still appears unused.
> v3: Some commentary on handling innocent vs guilty requests
> v4: Add a wait for PD_BASE fetch. The reload appears to be instant on my
> Ivybridge, but this bit probably exists for a reason.
>
> Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>

Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>

> ---
>  drivers/gpu/drm/i915/i915_gem.c         | 18 ++++------
>  drivers/gpu/drm/i915/i915_reg.h         |  6 ++--
>  drivers/gpu/drm/i915/intel_lrc.c        | 16 ++++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 58 +++++++++++++++++++++++++++++++--
>  4 files changed, 81 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 8a510c7f6828..b7632bbbafd8 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2743,21 +2743,17 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
>  		engine->irq_seqno_barrier(engine);
>  
>  	request = i915_gem_find_active_request(engine);
> -	if (!request)
> -		return;
> -
> -	if (!i915_gem_reset_request(request))
> -		return;
> +	if (request && i915_gem_reset_request(request)) {
> +		DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
> +				 engine->name, request->global_seqno);
>  
> -	DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
> -			 engine->name, request->global_seqno);
> +		/* If this context is now banned, skip all pending requests. */
> +		if (i915_gem_context_is_banned(request->ctx))
> +			engine_skip_context(request);
> +	}
>  
>  	/* Setup the CS to resume from the breadcrumb of the hung request */
>  	engine->reset_hw(engine, request);
> -
> -	/* If this context is now banned, skip all of its pending requests. */
> -	if (i915_gem_context_is_banned(request->ctx))
> -		engine_skip_context(request);
>  }
>  
>  void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 5bf36bdc5926..4a59091c0152 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -3323,8 +3323,10 @@ enum skl_disp_power_wells {
>  /*
>   * Logical Context regs
>   */
> -#define CCID			_MMIO(0x2180)
> -#define   CCID_EN		(1<<0)
> +#define CCID				_MMIO(0x2180)
> +#define   CCID_EN			BIT(0)
> +#define   CCID_EXTENDED_STATE_RESTORE	BIT(2)
> +#define   CCID_EXTENDED_STATE_SAVE	BIT(3)
>  /*
>   * Notes on SNB/IVB/VLV context size:
>   * - Power context is saved elsewhere (LLC or stolen)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index df8e6f74dc7e..e42990b56fa8 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1352,7 +1352,20 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  			      struct drm_i915_gem_request *request)
>  {
>  	struct execlist_port *port = engine->execlist_port;
> -	struct intel_context *ce = &request->ctx->engine[engine->id];
> +	struct intel_context *ce;
> +
> +	/* If the request was innocent, we leave the request in the ELSP
> +	 * and will try to replay it on restarting. The context image may
> +	 * have been corrupted by the reset, in which case we may have
> +	 * to service a new GPU hang, but more likely we can continue on
> +	 * without impact.
> +	 *
> +	 * If the request was guilty, we presume the context is corrupt
> +	 * and have to at least restore the RING register in the context
> +	 * image back to the expected values to skip over the guilty request.
> +	 */
> +	if (!request || request->fence.error != -EIO)
> +		return;
>  
>  	/* We want a simple context + ring to execute the breadcrumb update.
>  	 * We cannot rely on the context being intact across the GPU hang,
> @@ -1361,6 +1374,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	 * future request will be after userspace has had the opportunity
>  	 * to recreate its own state.
>  	 */
> +	ce = &request->ctx->engine[engine->id];
>  	execlists_init_reg_state(ce->lrc_reg_state,
>  				 request->ctx, engine, ce->ring);
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 383083ef2210..d3d1e64f2498 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -599,10 +599,62 @@ static int init_ring_common(struct intel_engine_cs *engine)
>  static void reset_ring_common(struct intel_engine_cs *engine,
>  			      struct drm_i915_gem_request *request)
>  {
> -	struct intel_ring *ring = request->ring;
> +	/* Try to restore the logical GPU state to match the continuation
> +	 * of the request queue. If we skip the context/PD restore, then
> +	 * the next request may try to execute assuming that its context
> +	 * is valid and loaded on the GPU and so may try to access invalid
> +	 * memory, prompting repeated GPU hangs.
> +	 *
> +	 * If the request was guilty, we still restore the logical state
> +	 * in case the next request requires it (e.g. the aliasing ppgtt),
> +	 * but skip over the hung batch.
> +	 *
> +	 * If the request was innocent, we try to replay the request with
> +	 * the restored context.
> +	 */
> +	if (request) {
> +		struct drm_i915_private *dev_priv = request->i915;
> +		struct intel_context *ce = &request->ctx->engine[engine->id];
> +		struct i915_hw_ppgtt *ppgtt;
> +
> +		/* FIXME consider gen8 reset */
> +
> +		if (ce->state) {
> +			I915_WRITE(CCID,
> +				   i915_ggtt_offset(ce->state) |
> +				   BIT(8) /* must be set! */ |
> +				   CCID_EXTENDED_STATE_SAVE |
> +				   CCID_EXTENDED_STATE_RESTORE |
> +				   CCID_EN);
> +		}
>  
> -	ring->head = request->postfix;
> -	ring->last_retired_head = -1;
> +		ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt;
> +		if (ppgtt) {
> +			u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10;
> +
> +			I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G);
> +			I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset);
> +
> +			/* Wait for the PD reload to complete */
> +			if (intel_wait_for_register(dev_priv,
> +						    RING_PP_DIR_BASE(engine),
> +						    BIT(0), 0,
> +						    10))
> +				DRM_ERROR("Wait for reload of ppgtt page-directory timed out\n");
> +
> +			ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine);
> +		}
> +
> +		/* If the rq hung, jump to its breadcrumb and skip the batch */
> +		if (request->fence.error == -EIO) {
> +			struct intel_ring *ring = request->ring;
> +
> +			ring->head = request->postfix;
> +			ring->last_retired_head = -1;
> +		}
> +	} else {
> +		engine->legacy_active_context = NULL;
> +	}
>  }
>  
>  static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx