On Thu, Aug 26, 2021 at 12:27:31PM +0100, Tvrtko Ursulin wrote: > > On 26/08/2021 04:49, Matthew Brost wrote: > > On Wed, Aug 25, 2021 at 11:39:10AM +0100, Tvrtko Ursulin wrote: > > > > > > On 27/07/2021 01:23, Matthew Brost wrote: > > > > When using GuC submission, if a context gets banned disable scheduling > > > > and mark all inflight requests as complete. > > > > > > > > Cc: John Harrison <John.C.Harrison@xxxxxxxxx> > > > > Signed-off-by: Matthew Brost <matthew.brost@xxxxxxxxx> > > > > Reviewed-by: John Harrison <John.C.Harrison@xxxxxxxxx> > > > > --- > > > > drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 +- > > > > drivers/gpu/drm/i915/gt/intel_context.h | 13 ++ > > > > drivers/gpu/drm/i915/gt/intel_context_types.h | 2 + > > > > drivers/gpu/drm/i915/gt/intel_reset.c | 32 +--- > > > > .../gpu/drm/i915/gt/intel_ring_submission.c | 20 +++ > > > > drivers/gpu/drm/i915/gt/uc/intel_guc.h | 2 + > > > > .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 ++++++++++++++++-- > > > > drivers/gpu/drm/i915/i915_trace.h | 10 ++ > > > > 8 files changed, 195 insertions(+), 37 deletions(-) > > > > > > > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c > > > > index e3df01a201d7..05c3ee191710 100644 > > > > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c > > > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c > > > > @@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban) > > > > for_each_gem_engine(ce, engines, it) { > > > > struct intel_engine_cs *engine; > > > > - if (ban && intel_context_set_banned(ce)) > > > > + if (ban && intel_context_ban(ce, NULL)) > > > > continue; > > > > /* > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h > > > > index 2ed9bf5f91a5..814d9277096a 100644 > > > > --- a/drivers/gpu/drm/i915/gt/intel_context.h > > > > +++ b/drivers/gpu/drm/i915/gt/intel_context.h > > > > @@ -16,6 +16,7 @@ > > > > #include 
"intel_engine_types.h" > > > > #include "intel_ring_types.h" > > > > #include "intel_timeline_types.h" > > > > +#include "i915_trace.h" > > > > #define CE_TRACE(ce, fmt, ...) do { \ > > > > const struct intel_context *ce__ = (ce); \ > > > > @@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct intel_context *ce) > > > > return test_and_set_bit(CONTEXT_BANNED, &ce->flags); > > > > } > > > > +static inline bool intel_context_ban(struct intel_context *ce, > > > > + struct i915_request *rq) > > > > +{ > > > > + bool ret = intel_context_set_banned(ce); > > > > + > > > > + trace_intel_context_ban(ce); > > > > + if (ce->ops->ban) > > > > + ce->ops->ban(ce, rq); > > > > + > > > > + return ret; > > > > +} > > > > + > > > > static inline bool > > > > intel_context_force_single_submission(const struct intel_context *ce) > > > > { > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h > > > > index 035108c10b2c..57c19ee3e313 100644 > > > > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h > > > > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h > > > > @@ -35,6 +35,8 @@ struct intel_context_ops { > > > > int (*alloc)(struct intel_context *ce); > > > > + void (*ban)(struct intel_context *ce, struct i915_request *rq); > > > > + > > > > int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void **vaddr); > > > > int (*pin)(struct intel_context *ce, void *vaddr); > > > > void (*unpin)(struct intel_context *ce); > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c > > > > index 4d281bc8a38c..91200c43951f 100644 > > > > --- a/drivers/gpu/drm/i915/gt/intel_reset.c > > > > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c > > > > @@ -22,7 +22,6 @@ > > > > #include "intel_reset.h" > > > > #include "uc/intel_guc.h" > > > > -#include "uc/intel_guc_submission.h" > > > > #define RESET_MAX_RETRIES 3 > > > > @@ -39,21 +38,6 @@ static void 
rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) > > > > intel_uncore_rmw_fw(uncore, reg, clr, 0); > > > > } > > > > -static void skip_context(struct i915_request *rq) > > > > -{ > > > > - struct intel_context *hung_ctx = rq->context; > > > > - > > > > - list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) { > > > > - if (!i915_request_is_active(rq)) > > > > - return; > > > > - > > > > - if (rq->context == hung_ctx) { > > > > - i915_request_set_error_once(rq, -EIO); > > > > - __i915_request_skip(rq); > > > > - } > > > > - } > > > > -} > > > > - > > > > static void client_mark_guilty(struct i915_gem_context *ctx, bool banned) > > > > { > > > > struct drm_i915_file_private *file_priv = ctx->file_priv; > > > > @@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq) > > > > bool banned; > > > > int i; > > > > - if (intel_context_is_closed(rq->context)) { > > > > - intel_context_set_banned(rq->context); > > > > + if (intel_context_is_closed(rq->context)) > > > > return true; > > > > - } > > > > rcu_read_lock(); > > > > ctx = rcu_dereference(rq->context->gem_context); > > > > @@ -123,11 +105,9 @@ static bool mark_guilty(struct i915_request *rq) > > > > banned = !i915_gem_context_is_recoverable(ctx); > > > > if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) > > > > banned = true; > > > > - if (banned) { > > > > + if (banned) > > > > drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n", > > > > ctx->name, atomic_read(&ctx->guilty_count)); > > > > - intel_context_set_banned(rq->context); > > > > - } > > > > client_mark_guilty(ctx, banned); > > > > @@ -149,6 +129,8 @@ static void mark_innocent(struct i915_request *rq) > > > > void __i915_request_reset(struct i915_request *rq, bool guilty) > > > > { > > > > + bool banned = false; > > > > + > > > > RQ_TRACE(rq, "guilty? 
%s\n", yesno(guilty)); > > > > GEM_BUG_ON(__i915_request_is_complete(rq)); > > > > @@ -156,13 +138,15 @@ void __i915_request_reset(struct i915_request *rq, bool guilty) > > > > if (guilty) { > > > > i915_request_set_error_once(rq, -EIO); > > > > __i915_request_skip(rq); > > > > - if (mark_guilty(rq) && !intel_engine_uses_guc(rq->engine)) > > > > - skip_context(rq); > > > > + banned = mark_guilty(rq); > > > > } else { > > > > i915_request_set_error_once(rq, -EAGAIN); > > > > mark_innocent(rq); > > > > } > > > > rcu_read_unlock(); > > > > + > > > > + if (banned) > > > > + intel_context_ban(rq->context, rq); > > > > } > > > > static bool i915_in_reset(struct pci_dev *pdev) > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c > > > > index a5404c7b600f..05bb9f449df1 100644 > > > > --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c > > > > +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c > > > > @@ -586,9 +586,29 @@ static void ring_context_reset(struct intel_context *ce) > > > > clear_bit(CONTEXT_VALID_BIT, &ce->flags); > > > > } > > > > +static void ring_context_ban(struct intel_context *ce, > > > > + struct i915_request *rq) > > > > +{ > > > > + struct intel_engine_cs *engine; > > > > + > > > > + if (!rq || !i915_request_is_active(rq)) > > > > + return; > > > When this gets called from context close, via intel_context_ban, rq will be > > > always NULL - so the below loop which skips a line of executing requests > > > never gets to run. > > > > > > This appears a functional change versus previous state of the codebase, > > > where skip_context() would run in any case. > > > > > > > Hmm, not so sure about this being a functional change. When called from > > __i915_request_reset, intel_context_ban (previously skip_context there) > > always has a non-NULL request argument. 
When called from kill_engines > > the request is NULL (short-circuiting the ring function), but we never > > hooked into the ring backend before. > > Yeah that's the area of confusion. You have defined the interface as > "intel_context_ban(ce, rq)" and it is called two times during a common flow. > It is true there is no functional change, but it is confusing what the > purpose of intel_context_ban then is, given how the absence of the rq parameter > on the context close path makes it do almost nothing (apart from the case of > GuC), plus on top, the flow actually relies on it being called a 2nd time from > deeper in the stack in case of ringbuf (other backends do not care). > This was an attempt to pull backend-specific behavior (i.e. skip_context prior to my patch) into the backend. > > > > > I had this observation from a patch I am working on (https://intel-gfx-ci.01.org/tree/drm-tip/Trybot_7950/shard-snb6/igt@gem_ctx_exec@xxxxxxxxxxxxxxxxxxxxxx > > > - so a long line of executing requests which did not get zapped post reset), > > > but I may be wrong. Maybe I am missing something since I don't yet understand > > > why I would be the first to hit this issue. So take it with a grain of salt for > > > now. > > > > > > > CI was green on my series but CI doesn't always catch everything... > > > > Do you have a link to your series that I can look at? > > It was on trybot, thought you'd figure it out, but now I have sent it to > intel-gfx as well so please have a look. To be clear I am not too happy with > the current state of that patch.. > I found it; almost certainly your problem is short-circuiting the ce->ops->ban call based on the state of the ban bit. > > > > > Ah.. maybe the key is that in my patch I made intel_context_ban not call > > > ce->ops->"ban" (I renamed it to revoke) unconditionally. Hence there may be > > > a path where intel_context_ban is first called w/o a rq, then from within > > > __i915_request_reset it gets called with rq, which now fails to call the > > > vfunc. 
Hm that's clunky and fragile whichever way I look at it. I'll trybot > > > one more experiment.. > > > > > > > I'm open to suggestions on how to change this, to make it make sense to > > call into the function unconditionally with the correct arguments and the > > backend handles the rest (i.e. the way it is currently). > > .. given what I wrote in the first paragraph. And I presently have no > suggestions how to improve it, since it seems complicated enough to require > quite a bit of thinking. > > The issues as I see them now, intel_context_ban, when called from the > context close path: > > Guc: > * Does not use the rq parameter even if it was available. > * Deals with the list of queued requests. > (Although why isn't guc_cancel_context_requests doing any skipping, > contrary to the comment inside?) > It calls i915_request_mark_eio which skips a request without putting it back on the HW. The context is banned, thus we can't / shouldn't put that context on the HW ever again. > Execlists: > * Does not use the vfunc at all, remains to be just a set_bit call. > Chris pulled the execlists vfunc out, but isn't the point of having a vfunc that it can be present or not? > Ringbuffer: > * Just sets the bit. > > When called from the reset path: > > Guc: > * Same as on context close path - redundant? > Not redundant, in both cases we need to ban the context in the GuC. Scheduling is likely disabled in this path so we can cancel the requests straight away. > Execlists: > * Nothing, sets the same already set bit potentially. > > Ringbuf: > * Deals with cancelling queued requests - like GuC does in the 1st > invocation already. > * Does not appear to use the rq argument meaningfully. Isn't the "is not > active" check redundant to sched_engine->requests only having active requests > on the list? Not sure. This is why I added the request argument; perhaps it could be removed? It being present / NULL also has a meaning (post-reset, pre-reset). 
> > Most obvious open is whether the rq paramenter is even needed. > Maybe, see above. > Then is the redundant call to the same func in case of GuC needed? If not, > which one should remain - first or the second call? > In the case of the GuC we need both calls. Matt > Regards, > > Tvrtko > > > Matt > > > > > Regards, > > > > > > Tvrtko > > > > > > > + > > > > + engine = rq->engine; > > > > + lockdep_assert_held(&engine->sched_engine->lock); > > > > + list_for_each_entry_continue(rq, &engine->sched_engine->requests, > > > > + sched.link) > > > > + if (rq->context == ce) { > > > > + i915_request_set_error_once(rq, -EIO); > > > > + __i915_request_skip(rq); > > > > + } > > > > +} > > > > + > > > > static const struct intel_context_ops ring_context_ops = { > > > > .alloc = ring_context_alloc, > > > > + .ban = ring_context_ban, > > > > + > > > > .pre_pin = ring_context_pre_pin, > > > > .pin = ring_context_pin, > > > > .unpin = ring_context_unpin, > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h > > > > index 1875303c3bca..8ab70a2223b0 100644 > > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h > > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h > > > > @@ -281,6 +281,8 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine); > > > > int intel_guc_global_policies_update(struct intel_guc *guc); > > > > +void intel_guc_context_ban(struct intel_context *ce, struct i915_request *rq); > > > > + > > > > void intel_guc_submission_reset_prepare(struct intel_guc *guc); > > > > void intel_guc_submission_reset(struct intel_guc *guc, bool stalled); > > > > void intel_guc_submission_reset_finish(struct intel_guc *guc); > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > > > index cea3e3073a71..ad9a38a861df 100644 > > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c > > > > 
@@ -125,6 +125,7 @@ static inline void clr_context_pending_enable(struct intel_context *ce) > > > > #define SCHED_STATE_WAIT_FOR_DEREGISTER_TO_REGISTER BIT(0) > > > > #define SCHED_STATE_DESTROYED BIT(1) > > > > #define SCHED_STATE_PENDING_DISABLE BIT(2) > > > > +#define SCHED_STATE_BANNED BIT(3) > > > > static inline void init_sched_state(struct intel_context *ce) > > > > { > > > > /* Only should be called from guc_lrc_desc_pin() */ > > > > @@ -185,6 +186,23 @@ static inline void clr_context_pending_disable(struct intel_context *ce) > > > > ce->guc_state.sched_state &= ~SCHED_STATE_PENDING_DISABLE; > > > > } > > > > +static inline bool context_banned(struct intel_context *ce) > > > > +{ > > > > + return ce->guc_state.sched_state & SCHED_STATE_BANNED; > > > > +} > > > > + > > > > +static inline void set_context_banned(struct intel_context *ce) > > > > +{ > > > > + lockdep_assert_held(&ce->guc_state.lock); > > > > + ce->guc_state.sched_state |= SCHED_STATE_BANNED; > > > > +} > > > > + > > > > +static inline void clr_context_banned(struct intel_context *ce) > > > > +{ > > > > + lockdep_assert_held(&ce->guc_state.lock); > > > > + ce->guc_state.sched_state &= ~SCHED_STATE_BANNED; > > > > +} > > > > + > > > > static inline bool context_guc_id_invalid(struct intel_context *ce) > > > > { > > > > return ce->guc_id == GUC_INVALID_LRC_ID; > > > > @@ -357,13 +375,23 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop); > > > > static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) > > > > { > > > > - int err; > > > > + int err = 0; > > > > struct intel_context *ce = rq->context; > > > > u32 action[3]; > > > > int len = 0; > > > > u32 g2h_len_dw = 0; > > > > bool enabled; > > > > + /* > > > > + * Corner case where requests were sitting in the priority list or a > > > > + * request resubmitted after the context was banned. 
> > > > + */ > > > > + if (unlikely(intel_context_is_banned(ce))) { > > > > + i915_request_put(i915_request_mark_eio(rq)); > > > > + intel_engine_signal_breadcrumbs(ce->engine); > > > > + goto out; > > > > + } > > > > + > > > > GEM_BUG_ON(!atomic_read(&ce->guc_id_ref)); > > > > GEM_BUG_ON(context_guc_id_invalid(ce)); > > > > @@ -399,6 +427,8 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) > > > > clr_context_pending_enable(ce); > > > > intel_context_put(ce); > > > > } > > > > + if (likely(!err)) > > > > + trace_i915_request_guc_submit(rq); > > > > out: > > > > return err; > > > > @@ -463,7 +493,6 @@ static int guc_dequeue_one_context(struct intel_guc *guc) > > > > guc->stalled_request = last; > > > > return false; > > > > } > > > > - trace_i915_request_guc_submit(last); > > > > } > > > > guc->stalled_request = NULL; > > > > @@ -502,12 +531,13 @@ static void cs_irq_handler(struct intel_engine_cs *engine, u16 iir) > > > > static void __guc_context_destroy(struct intel_context *ce); > > > > static void release_guc_id(struct intel_guc *guc, struct intel_context *ce); > > > > static void guc_signal_context_fence(struct intel_context *ce); > > > > +static void guc_cancel_context_requests(struct intel_context *ce); > > > > static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) > > > > { > > > > struct intel_context *ce; > > > > unsigned long index, flags; > > > > - bool pending_disable, pending_enable, deregister, destroyed; > > > > + bool pending_disable, pending_enable, deregister, destroyed, banned; > > > > xa_for_each(&guc->context_lookup, index, ce) { > > > > /* Flush context */ > > > > @@ -525,6 +555,7 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) > > > > pending_enable = context_pending_enable(ce); > > > > pending_disable = context_pending_disable(ce); > > > > deregister = context_wait_for_deregister_to_register(ce); > > > > + banned = context_banned(ce); > > > > init_sched_state(ce); > > > > 
if (pending_enable || destroyed || deregister) { > > > > @@ -542,6 +573,10 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc) > > > > /* Not mutualy exclusive with above if statement. */ > > > > if (pending_disable) { > > > > guc_signal_context_fence(ce); > > > > + if (banned) { > > > > + guc_cancel_context_requests(ce); > > > > + intel_engine_signal_breadcrumbs(ce->engine); > > > > + } > > > > intel_context_sched_disable_unpin(ce); > > > > atomic_dec(&guc->outstanding_submission_g2h); > > > > intel_context_put(ce); > > > > @@ -661,6 +696,9 @@ static void guc_reset_state(struct intel_context *ce, u32 head, bool scrub) > > > > { > > > > struct intel_engine_cs *engine = __context_to_physical_engine(ce); > > > > + if (intel_context_is_banned(ce)) > > > > + return; > > > > + > > > > GEM_BUG_ON(!intel_context_is_pinned(ce)); > > > > /* > > > > @@ -731,6 +769,8 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled) > > > > struct i915_request *rq; > > > > u32 head; > > > > + intel_context_get(ce); > > > > + > > > > /* > > > > * GuC will implicitly mark the context as non-schedulable > > > > * when it sends the reset notification. 
Make sure our state > > > > @@ -756,6 +796,7 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled) > > > > out_replay: > > > > guc_reset_state(ce, head, stalled); > > > > __unwind_incomplete_requests(ce); > > > > + intel_context_put(ce); > > > > } > > > > void intel_guc_submission_reset(struct intel_guc *guc, bool stalled) > > > > @@ -940,8 +981,6 @@ static int guc_bypass_tasklet_submit(struct intel_guc *guc, > > > > ret = guc_add_request(guc, rq); > > > > if (ret == -EBUSY) > > > > guc->stalled_request = rq; > > > > - else > > > > - trace_i915_request_guc_submit(rq); > > > > if (unlikely(ret == -EPIPE)) > > > > disable_submission(guc); > > > > @@ -1344,13 +1383,77 @@ static u16 prep_context_pending_disable(struct intel_context *ce) > > > > return ce->guc_id; > > > > } > > > > +static void __guc_context_set_preemption_timeout(struct intel_guc *guc, > > > > + u16 guc_id, > > > > + u32 preemption_timeout) > > > > +{ > > > > + u32 action[] = { > > > > + INTEL_GUC_ACTION_SET_CONTEXT_PREEMPTION_TIMEOUT, > > > > + guc_id, > > > > + preemption_timeout > > > > + }; > > > > + > > > > + intel_guc_send_busy_loop(guc, action, ARRAY_SIZE(action), 0, true); > > > > +} > > > > + > > > > +static void guc_context_ban(struct intel_context *ce, struct i915_request *rq) > > > > +{ > > > > + struct intel_guc *guc = ce_to_guc(ce); > > > > + struct intel_runtime_pm *runtime_pm = > > > > + &ce->engine->gt->i915->runtime_pm; > > > > + intel_wakeref_t wakeref; > > > > + unsigned long flags; > > > > + > > > > + guc_flush_submissions(guc); > > > > + > > > > + spin_lock_irqsave(&ce->guc_state.lock, flags); > > > > + set_context_banned(ce); > > > > + > > > > + if (submission_disabled(guc) || > > > > + (!context_enabled(ce) && !context_pending_disable(ce))) { > > > > + spin_unlock_irqrestore(&ce->guc_state.lock, flags); > > > > + > > > > + guc_cancel_context_requests(ce); > > > > + intel_engine_signal_breadcrumbs(ce->engine); > > > > + } else if 
(!context_pending_disable(ce)) { > > > > + u16 guc_id; > > > > + > > > > + /* > > > > + * We add +2 here as the schedule disable complete CTB handler > > > > + * calls intel_context_sched_disable_unpin (-2 to pin_count). > > > > + */ > > > > + atomic_add(2, &ce->pin_count); > > > > + > > > > + guc_id = prep_context_pending_disable(ce); > > > > + spin_unlock_irqrestore(&ce->guc_state.lock, flags); > > > > + > > > > + /* > > > > + * In addition to disabling scheduling, set the preemption > > > > + * timeout to the minimum value (1 us) so the banned context > > > > + * gets kicked off the HW ASAP. > > > > + */ > > > > + with_intel_runtime_pm(runtime_pm, wakeref) { > > > > + __guc_context_set_preemption_timeout(guc, guc_id, 1); > > > > + __guc_context_sched_disable(guc, ce, guc_id); > > > > + } > > > > + } else { > > > > + if (!context_guc_id_invalid(ce)) > > > > + with_intel_runtime_pm(runtime_pm, wakeref) > > > > + __guc_context_set_preemption_timeout(guc, > > > > + ce->guc_id, > > > > + 1); > > > > + spin_unlock_irqrestore(&ce->guc_state.lock, flags); > > > > + } > > > > +} > > > > + > > > > static void guc_context_sched_disable(struct intel_context *ce) > > > > { > > > > struct intel_guc *guc = ce_to_guc(ce); > > > > - struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm; > > > > unsigned long flags; > > > > - u16 guc_id; > > > > + struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm; > > > > intel_wakeref_t wakeref; > > > > + u16 guc_id; > > > > + bool enabled; > > > > if (submission_disabled(guc) || context_guc_id_invalid(ce) || > > > > !lrc_desc_registered(guc, ce->guc_id)) { > > > > @@ -1364,14 +1467,22 @@ static void guc_context_sched_disable(struct intel_context *ce) > > > > spin_lock_irqsave(&ce->guc_state.lock, flags); > > > > /* > > > > - * We have to check if the context has been pinned again as another pin > > > > - * operation is allowed to pass this function. 
Checking the pin count, > > > > - * within ce->guc_state.lock, synchronizes this function with > > > > + * We have to check if the context has been disabled by another thread. > > > > + * We also have to check if the context has been pinned again as another > > > > + * pin operation is allowed to pass this function. Checking the pin > > > > + * count, within ce->guc_state.lock, synchronizes this function with > > > > * guc_request_alloc ensuring a request doesn't slip through the > > > > * 'context_pending_disable' fence. Checking within the spin lock (can't > > > > * sleep) ensures another process doesn't pin this context and generate > > > > * a request before we set the 'context_pending_disable' flag here. > > > > */ > > > > + enabled = context_enabled(ce); > > > > + if (unlikely(!enabled || submission_disabled(guc))) { > > > > + if (enabled) > > > > + clr_context_enabled(ce); > > > > + spin_unlock_irqrestore(&ce->guc_state.lock, flags); > > > > + goto unpin; > > > > + } > > > > if (unlikely(atomic_add_unless(&ce->pin_count, -2, 2))) { > > > > spin_unlock_irqrestore(&ce->guc_state.lock, flags); > > > > return; > > > > @@ -1529,6 +1640,8 @@ static const struct intel_context_ops guc_context_ops = { > > > > .unpin = guc_context_unpin, > > > > .post_unpin = guc_context_post_unpin, > > > > + .ban = guc_context_ban, > > > > + > > > > .enter = intel_context_enter_engine, > > > > .exit = intel_context_exit_engine, > > > > @@ -1722,6 +1835,8 @@ static const struct intel_context_ops virtual_guc_context_ops = { > > > > .unpin = guc_context_unpin, > > > > .post_unpin = guc_context_post_unpin, > > > > + .ban = guc_context_ban, > > > > + > > > > .enter = guc_virtual_context_enter, > > > > .exit = guc_virtual_context_exit, > > > > @@ -2164,6 +2279,8 @@ int intel_guc_sched_done_process_msg(struct intel_guc *guc, > > > > if (context_pending_enable(ce)) { > > > > clr_context_pending_enable(ce); > > > > } else if (context_pending_disable(ce)) { > > > > + bool banned; > > > > + > > 
> > /* > > > > * Unpin must be done before __guc_signal_context_fence, > > > > * otherwise a race exists between the requests getting > > > > @@ -2174,9 +2291,16 @@ int intel_guc_sched_done_process_msg(struct intel_guc *guc, > > > > intel_context_sched_disable_unpin(ce); > > > > spin_lock_irqsave(&ce->guc_state.lock, flags); > > > > + banned = context_banned(ce); > > > > + clr_context_banned(ce); > > > > clr_context_pending_disable(ce); > > > > __guc_signal_context_fence(ce); > > > > spin_unlock_irqrestore(&ce->guc_state.lock, flags); > > > > + > > > > + if (banned) { > > > > + guc_cancel_context_requests(ce); > > > > + intel_engine_signal_breadcrumbs(ce->engine); > > > > + } > > > > } > > > > decr_outstanding_submission_g2h(guc); > > > > @@ -2211,8 +2335,11 @@ static void guc_handle_context_reset(struct intel_guc *guc, > > > > struct intel_context *ce) > > > > { > > > > trace_intel_context_reset(ce); > > > > - capture_error_state(guc, ce); > > > > - guc_context_replay(ce); > > > > + > > > > + if (likely(!intel_context_is_banned(ce))) { > > > > + capture_error_state(guc, ce); > > > > + guc_context_replay(ce); > > > > + } > > > > } > > > > int intel_guc_context_reset_process_msg(struct intel_guc *guc, > > > > diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h > > > > index 3f43d904f043..9613a7c19661 100644 > > > > --- a/drivers/gpu/drm/i915/i915_trace.h > > > > +++ b/drivers/gpu/drm/i915/i915_trace.h > > > > @@ -925,6 +925,11 @@ DEFINE_EVENT(intel_context, intel_context_reset, > > > > TP_ARGS(ce) > > > > ); > > > > +DEFINE_EVENT(intel_context, intel_context_ban, > > > > + TP_PROTO(struct intel_context *ce), > > > > + TP_ARGS(ce) > > > > +); > > > > + > > > > DEFINE_EVENT(intel_context, intel_context_register, > > > > TP_PROTO(struct intel_context *ce), > > > > TP_ARGS(ce) > > > > @@ -1017,6 +1022,11 @@ trace_intel_context_reset(struct intel_context *ce) > > > > { > > > > } > > > > +static inline void > > > > 
+trace_intel_context_ban(struct intel_context *ce) > > > > +{ > > > > +} > > > > + > > > > static inline void > > > > trace_intel_context_register(struct intel_context *ce) > > > > { > > > >