Re: [PATCH 1/2] drm/i915/guc: Implement reset locally

Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> · Thu, 11 Apr 2019 15:57:00 +0300

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> Before causing guc and execlists to diverge further (breaking guc in the
> process), take a copy of the current reset procedure and make it local to
> the guc submission backend
>

I agree strongly on the sentiment. This is not the time and
area to try to hold on to the consolidation.

The reset dance has proven to be hard enough and
during trying to atone to the hw's peculiarities,
last thing we want is to step on eachothers toes.

Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>

> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/intel_guc_submission.c | 102 ++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.c            |  37 ++++++-
>  drivers/gpu/drm/i915/intel_lrc.h            |   5 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h     |   2 +-
>  4 files changed, 143 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index 42fcd622d7a3..6ebc125710ce 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -872,6 +872,104 @@ static void guc_reset_prepare(struct intel_engine_cs *engine)
>  		flush_workqueue(engine->i915->guc.preempt_wq);
>  }
>  
> +static void guc_reset(struct intel_engine_cs *engine, bool stalled)
> +{
> +	struct intel_engine_execlists * const execlists = &engine->execlists;
> +	struct i915_request *rq;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&engine->timeline.lock, flags);
> +
> +	execlists_cancel_port_requests(execlists);
> +
> +	/* Push back any incomplete requests for replay after the reset. */
> +	rq = execlists_unwind_incomplete_requests(execlists);
> +	if (!rq)
> +		goto out_unlock;
> +
> +	if (!i915_request_started(rq))
> +		stalled = false;
> +
> +	i915_reset_request(rq, stalled);
> +	intel_lr_context_reset(engine, rq->hw_context, rq->head, stalled);
> +
> +out_unlock:
> +	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +}
> +
> +static void guc_cancel_requests(struct intel_engine_cs *engine)
> +{
> +	struct intel_engine_execlists * const execlists = &engine->execlists;
> +	struct i915_request *rq, *rn;
> +	struct rb_node *rb;
> +	unsigned long flags;
> +
> +	GEM_TRACE("%s\n", engine->name);
> +
> +	/*
> +	 * Before we call engine->cancel_requests(), we should have exclusive
> +	 * access to the submission state. This is arranged for us by the
> +	 * caller disabling the interrupt generation, the tasklet and other
> +	 * threads that may then access the same state, giving us a free hand
> +	 * to reset state. However, we still need to let lockdep be aware that
> +	 * we know this state may be accessed in hardirq context, so we
> +	 * disable the irq around this manipulation and we want to keep
> +	 * the spinlock focused on its duties and not accidentally conflate
> +	 * coverage to the submission's irq state. (Similarly, although we
> +	 * shouldn't need to disable irq around the manipulation of the
> +	 * submission's irq state, we also wish to remind ourselves that
> +	 * it is irq state.)
> +	 */
> +	spin_lock_irqsave(&engine->timeline.lock, flags);
> +
> +	/* Cancel the requests on the HW and clear the ELSP tracker. */
> +	execlists_cancel_port_requests(execlists);
> +
> +	/* Mark all executing requests as skipped. */
> +	list_for_each_entry(rq, &engine->timeline.requests, link) {
> +		if (!i915_request_signaled(rq))
> +			dma_fence_set_error(&rq->fence, -EIO);
> +
> +		i915_request_mark_complete(rq);
> +	}
> +
> +	/* Flush the queued requests to the timeline list (for retiring). */
> +	while ((rb = rb_first_cached(&execlists->queue))) {
> +		struct i915_priolist *p = to_priolist(rb);
> +		int i;
> +
> +		priolist_for_each_request_consume(rq, rn, p, i) {
> +			list_del_init(&rq->sched.link);
> +			__i915_request_submit(rq);
> +			dma_fence_set_error(&rq->fence, -EIO);
> +			i915_request_mark_complete(rq);
> +		}
> +
> +		rb_erase_cached(&p->node, &execlists->queue);
> +		i915_priolist_free(p);
> +	}
> +
> +	/* Remaining _unready_ requests will be nop'ed when submitted */
> +
> +	execlists->queue_priority_hint = INT_MIN;
> +	execlists->queue = RB_ROOT_CACHED;
> +	GEM_BUG_ON(port_isset(execlists->port));
> +
> +	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +}
> +
> +static void guc_reset_finish(struct intel_engine_cs *engine)
> +{
> +	struct intel_engine_execlists * const execlists = &engine->execlists;
> +
> +	if (__tasklet_enable(&execlists->tasklet))
> +		/* And kick in case we missed a new request submission. */
> +		tasklet_hi_schedule(&execlists->tasklet);
> +
> +	GEM_TRACE("%s: depth->%d\n", engine->name,
> +		  atomic_read(&execlists->tasklet.count));
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -1293,6 +1391,10 @@ static void guc_set_default_submission(struct intel_engine_cs *engine)
>  	engine->unpark = guc_submission_unpark;
>  
>  	engine->reset.prepare = guc_reset_prepare;
> +	engine->reset.reset = guc_reset;
> +	engine->reset.finish = guc_reset_finish;
> +
> +	engine->cancel_requests = guc_cancel_requests;
>  
>  	engine->flags &= ~I915_ENGINE_SUPPORTS_STATS;
>  }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 6931dbb2888c..f096bc7bbe35 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -429,13 +429,13 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
>  	return active;
>  }
>  
> -void
> +struct i915_request *
>  execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
>  {
>  	struct intel_engine_cs *engine =
>  		container_of(execlists, typeof(*engine), execlists);
>  
> -	__unwind_incomplete_requests(engine);
> +	return __unwind_incomplete_requests(engine);
>  }
>  
>  static inline void
> @@ -2328,6 +2328,8 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
>  	engine->execlists.tasklet.func = execlists_submission_tasklet;
>  
>  	engine->reset.prepare = execlists_reset_prepare;
> +	engine->reset.reset = execlists_reset;
> +	engine->reset.finish = execlists_reset_finish;
>  
>  	engine->park = NULL;
>  	engine->unpark = NULL;
> @@ -2980,6 +2982,37 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>  	spin_unlock_irqrestore(&engine->timeline.lock, flags);
>  }
>  
> +void intel_lr_context_reset(struct intel_engine_cs *engine,
> +			    struct intel_context *ce,
> +			    u32 head,
> +			    bool scrub)
> +{
> +	/*
> +	 * We want a simple context + ring to execute the breadcrumb update.
> +	 * We cannot rely on the context being intact across the GPU hang,
> +	 * so clear it and rebuild just what we need for the breadcrumb.
> +	 * All pending requests for this context will be zapped, and any
> +	 * future request will be after userspace has had the opportunity
> +	 * to recreate its own state.
> +	 */
> +	if (scrub) {
> +		u32 *regs = ce->lrc_reg_state;
> +
> +		if (engine->pinned_default_state) {
> +			memcpy(regs, /* skip restoring the vanilla PPHWSP */
> +			       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
> +			       engine->context_size - PAGE_SIZE);
> +		}
> +		execlists_init_reg_state(regs, ce, engine, ce->ring);
> +	}
> +
> +	/* Rerun the request; its payload has been neutered (if guilty). */
> +	ce->ring->head = head;
> +	intel_ring_update_space(ce->ring);
> +
> +	__execlists_update_reg_state(ce, engine);
> +}
> +
>  #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>  #include "selftests/intel_lrc.c"
>  #endif
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 92642ab91472..5b20f1a9ea0f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -105,6 +105,11 @@ struct i915_gem_context;
>  void intel_lr_context_resume(struct drm_i915_private *dev_priv);
>  void intel_execlists_set_default_submission(struct intel_engine_cs *engine);
>  
> +void intel_lr_context_reset(struct intel_engine_cs *engine,
> +			    struct intel_context *ce,
> +			    u32 head,
> +			    bool scrub);
> +
>  void intel_execlists_show_requests(struct intel_engine_cs *engine,
>  				   struct drm_printer *m,
>  				   void (*show_request)(struct drm_printer *m,
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index e58d6f04177b..bc1d7447a6cb 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -165,7 +165,7 @@ void execlists_user_end(struct intel_engine_execlists *execlists);
>  void
>  execlists_cancel_port_requests(struct intel_engine_execlists * const execlists);
>  
> -void
> +struct i915_request *
>  execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists);
>  
>  static inline unsigned int
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx