On Tue, May 11, 2021 at 06:28:25PM +0200, Daniel Vetter wrote: > On Thu, May 06, 2021 at 12:14:28PM -0700, Matthew Brost wrote: > > We receive notification of an engine reset from GuC at its > > completion. Meaning GuC has potentially cleared any HW state > > we may have been interested in capturing. GuC resumes scheduling > > on the engine post-reset, as the resets are meant to be transparent, > > further muddling our error state. > > > > There is ongoing work to define an API for a GuC debug state dump. The > > suggestion for now is to manually disable FW initiated resets in cases > > where debug state is needed. > > > > Signed-off-by: Matthew Brost <matthew.brost@xxxxxxxxx> > > This looks a bit backwards to me: > Definitely a bit hacky but this patch does the best to capture the error as it can, > - I figured we should capture error state when we get the G2H, in which > case I hope we do know which the offending context was that got shot. > We know which context was shot based on the G2H. See 'hung_ce' in this patch. > - For now we're missing the hw state, but we should still be able to > capture the buffers userspace wants us to capture. So that could be > wired up already? Which buffers exactly? We dump all buffers associated with the context. > > But yeah register state capturing needs support from GuC fw. > > I think this is a big enough miss in GuC features that we should list it > on the rfc as a thing to fix. Agree this needs to be fixed. Matt > -Daniel > > > --- > > drivers/gpu/drm/i915/gt/intel_context.c | 20 +++++++++++ > > drivers/gpu/drm/i915/gt/intel_context.h | 3 ++ > > drivers/gpu/drm/i915/gt/intel_engine.h | 21 ++++++++++- > > drivers/gpu/drm/i915/gt/intel_engine_cs.c | 11 ++++-- > > drivers/gpu/drm/i915/gt/intel_engine_types.h | 2 ++ > > .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 35 +++++++++---------- > > drivers/gpu/drm/i915/i915_gpu_error.c | 25 ++++++++++--- > > 7 files changed, 91 insertions(+), 26 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c > > index 2f01437056a8..3fe7794b2bfd 100644 > > --- a/drivers/gpu/drm/i915/gt/intel_context.c > > +++ b/drivers/gpu/drm/i915/gt/intel_context.c > > @@ -514,6 +514,26 @@ struct i915_request *intel_context_create_request(struct intel_context *ce) > > return rq; > > } > > > > +struct i915_request *intel_context_find_active_request(struct intel_context *ce) > > +{ > > + struct i915_request *rq, *active = NULL; > > + unsigned long flags; > > + > > + GEM_BUG_ON(!intel_engine_uses_guc(ce->engine)); > > + > > + spin_lock_irqsave(&ce->guc_active.lock, flags); > > + list_for_each_entry_reverse(rq, &ce->guc_active.requests, > > + sched.link) { > > + if (i915_request_completed(rq)) > > + break; > > + > > + active = rq; > > + } > > + spin_unlock_irqrestore(&ce->guc_active.lock, flags); > > + > > + return active; > > +} > > + > > #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) > > #include "selftest_context.c" > > #endif > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h > > index 9b211ca5ecc7..d2b499ed8a05 100644 > > --- a/drivers/gpu/drm/i915/gt/intel_context.h > > +++ b/drivers/gpu/drm/i915/gt/intel_context.h > > @@ -195,6 +195,9 @@ int intel_context_prepare_remote_request(struct intel_context *ce, > > > > struct i915_request *intel_context_create_request(struct intel_context *ce); > > > > +struct i915_request * > > +intel_context_find_active_request(struct intel_context *ce); > > + > > static inline struct intel_ring *__intel_context_ring_size(u64 sz) > > { > > return u64_to_ptr(struct intel_ring, sz); > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h > > index 3321d0917a99..bb94963a9fa2 100644 > > --- a/drivers/gpu/drm/i915/gt/intel_engine.h > > +++ b/drivers/gpu/drm/i915/gt/intel_engine.h > > @@ -242,7 +242,7 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, > > ktime_t *now); > > > > struct i915_request * > > -intel_engine_find_active_request(struct intel_engine_cs *engine); > > +intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine); > > > > u32 intel_engine_context_size(struct intel_gt *gt, u8 class); > > > > @@ -316,4 +316,23 @@ intel_engine_get_sibling(struct intel_engine_cs *engine, unsigned int sibling) > > return engine->cops->get_sibling(engine, sibling); > > } > > > > +static inline void > > +intel_engine_set_hung_context(struct intel_engine_cs *engine, > > + struct intel_context *ce) > > +{ > > + engine->hung_ce = ce; > > +} > > + > > +static inline void > > +intel_engine_clear_hung_context(struct intel_engine_cs *engine) > > +{ > > + intel_engine_set_hung_context(engine, NULL); > > +} > > + > > +static inline struct intel_context * > > +intel_engine_get_hung_context(struct intel_engine_cs *engine) > > +{ > > + return engine->hung_ce; > > +} > > + > > #endif /* _INTEL_RINGBUFFER_H_ */ > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > > index 10300db1c9a6..ad3987289f09 100644 > > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > > @@ -1727,7 +1727,7 @@ void intel_engine_dump(struct intel_engine_cs *engine, > > drm_printf(m, "\tRequests:\n"); > > > > spin_lock_irqsave(&engine->sched_engine->lock, flags); > > - rq = intel_engine_find_active_request(engine); > > + rq = intel_engine_execlist_find_hung_request(engine); > > if (rq) { > > struct intel_timeline *tl = get_timeline(rq); > > > > @@ -1838,10 +1838,17 @@ static bool match_ring(struct i915_request *rq) > > } > > > > struct i915_request * > > -intel_engine_find_active_request(struct intel_engine_cs *engine) > > +intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine) > > { > > struct i915_request *request, *active = NULL; > > > > + /* > > + * This search does not work in GuC submission mode. However, the GuC > > + * will report the hanging context directly to the driver itself. So > > + * the driver should never get here when in GuC mode. > > + */ > > + GEM_BUG_ON(intel_uc_uses_guc_submission(&engine->gt->uc)); > > + > > /* > > * We are called by the error capture, reset and to dump engine > > * state at random points in time. In particular, note that neither is > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h > > index b84562b2708b..bba53e3b39b9 100644 > > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h > > @@ -304,6 +304,8 @@ struct intel_engine_cs { > > /* keep a request in reserve for a [pm] barrier under oom */ > > struct i915_request *request_pool; > > > > + struct intel_context *hung_ce; > > + > > struct llist_head barrier_tasks; > > > > struct intel_context *kernel_context; /* pinned */ > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > index 22f17a055b21..6b3b74e50b31 100644 > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > @@ -726,24 +726,6 @@ __unwind_incomplete_requests(struct intel_context *ce) > > spin_unlock_irqrestore(&sched_engine->lock, flags); > > } > > > > -static struct i915_request *context_find_active_request(struct intel_context *ce) > > -{ > > - struct i915_request *rq, *active = NULL; > > - unsigned long flags; > > - > > - spin_lock_irqsave(&ce->guc_active.lock, flags); > > - list_for_each_entry_reverse(rq, &ce->guc_active.requests, > > - sched.link) { > > - if (i915_request_completed(rq)) > > - break; > > - > > - active = rq; > > - } > > - spin_unlock_irqrestore(&ce->guc_active.lock, flags); > > - > > - return active; > > -} > > - > > static void __guc_reset_context(struct intel_context *ce, bool stalled) > > { > > struct i915_request *rq; > > @@ -757,7 +739,7 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled) > > */ > > clr_context_enabled(ce); > > > > - rq = context_find_active_request(ce); > > + rq = intel_context_find_active_request(ce); > > if (!rq) { > > head = ce->ring->tail; > > stalled = false; > > @@ -2192,6 +2174,20 @@ int intel_guc_sched_done_process_msg(struct intel_guc *guc, > > return 0; > > } > > > > +static void capture_error_state(struct intel_guc *guc, > > + struct intel_context *ce) > > +{ > > + struct intel_gt *gt = guc_to_gt(guc); > > + struct drm_i915_private *i915 = gt->i915; > > + struct intel_engine_cs *engine = __context_to_physical_engine(ce); > > + intel_wakeref_t wakeref; > > + > > + intel_engine_set_hung_context(engine, ce); > > + with_intel_runtime_pm(&i915->runtime_pm, wakeref) > > + i915_capture_error_state(gt, engine->mask); > > + atomic_inc(&i915->gpu_error.reset_engine_count[engine->uabi_class]); > > +} > > + > > static void guc_context_replay(struct intel_context *ce) > > { > > struct i915_sched_engine *sched_engine = ce->engine->sched_engine; > > @@ -2204,6 +2200,7 @@ static void guc_handle_context_reset(struct intel_guc *guc, > > struct intel_context *ce) > > { > > trace_intel_context_reset(ce); > > + capture_error_state(guc, ce); > > guc_context_replay(ce); > > } > > > > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > > index 3352f56bcf63..825bdfe44225 100644 > > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > > @@ -1435,20 +1435,37 @@ capture_engine(struct intel_engine_cs *engine, > > { > > struct intel_engine_capture_vma *capture = NULL; > > struct intel_engine_coredump *ee; > > - struct i915_request *rq; > > + struct intel_context *ce; > > + struct i915_request *rq = NULL; > > unsigned long flags; > > > > ee = intel_engine_coredump_alloc(engine, GFP_KERNEL); > > if (!ee) > > return NULL; > > > > - spin_lock_irqsave(&engine->sched_engine->lock, flags); > > - rq = intel_engine_find_active_request(engine); > > + ce = intel_engine_get_hung_context(engine); > > + if (ce) { > > + intel_engine_clear_hung_context(engine); > > + rq = intel_context_find_active_request(ce); > > + if (!rq || !i915_request_started(rq)) > > + goto no_request_capture; > > + } else { > > + /* > > + * Getting here with GuC enabled means it is a forced error capture > > + * with no actual hang. So, no need to attempt the execlist search. > > + */ > > + if (!intel_uc_uses_guc_submission(&engine->gt->uc)) { > > + spin_lock_irqsave(&engine->sched_engine->lock, flags); > > + rq = intel_engine_execlist_find_hung_request(engine); > > + spin_unlock_irqrestore(&engine->sched_engine->lock, > > + flags); > > + } > > + } > > if (rq) > > capture = intel_engine_coredump_add_request(ee, rq, > > ATOMIC_MAYFAIL); > > - spin_unlock_irqrestore(&engine->sched_engine->lock, flags); > > if (!capture) { > > +no_request_capture: > > kfree(ee); > > return NULL; > > } > > -- > > 2.28.0 > > > > _______________________________________________ > > Intel-gfx mailing list > > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx > > -- > Daniel Vetter > Software Engineer, Intel Corporation > http://blog.ffwll.ch