Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Currently, we accumulate each time a context hangs the GPU, offset > against the number of requests it submits, and if that score exceeds a > certain threshold, we ban that context from submitting any more requests > (cancelling any work in flight). In contrast, we use a simple timer on > the file, that if we see more than a 9 hangs faster than 60s apart in > total across all of its contexts, we will ban the client from creating > any more contexts. This leads to a confusing situation where the file > may be banned before the context, so lets use a simple timer scheme for > each. > > If the context submits 3 hanging requests within a 120s period, declare > it forbidden to ever send more requests. > > This has the advantage of not being easy to repair by simply sending > empty requests, but has the disadvantage that if the context is idle > then it is forgiven. However, if the context is idle, it is not > disrupting the system, but a hog can evade the request counting and > cause much more severe disruption to the system. > > Updating ban_score from request retirement is dubious as the retirement > is purposely not in sync with request submission (i.e. we try and batch > retirement to reduce overhead and avoid latency on submission), which > leads to surprising situations where we can forgive a hang immediately > due to a backlog of requests from before the hang being retired > afterwards. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_gem_context.c | 4 ++++ > drivers/gpu/drm/i915/i915_gem_context.h | 9 +++---- > drivers/gpu/drm/i915/i915_gpu_error.c | 31 +++++++------------------ > drivers/gpu/drm/i915/i915_gpu_error.h | 3 --- > drivers/gpu/drm/i915/i915_request.c | 2 -- > drivers/gpu/drm/i915/i915_reset.c | 27 ++++++++++++--------- > 6 files changed, 33 insertions(+), 43 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > index da21c843fed8..7541c6f961b3 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > @@ -355,6 +355,7 @@ __create_hw_context(struct drm_i915_private *dev_priv, > struct i915_gem_context *ctx; > unsigned int n; > int ret; > + int i; > > ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); > if (ctx == NULL) > @@ -407,6 +408,9 @@ __create_hw_context(struct drm_i915_private *dev_priv, > ctx->desc_template = > default_desc_template(dev_priv, dev_priv->mm.aliasing_ppgtt); > > + for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++) > + ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES; > + > return ctx; > > err_pid: > diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h > index 071108d34ae0..dc6c58f38cfa 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.h > +++ b/drivers/gpu/drm/i915/i915_gem_context.h > @@ -209,10 +209,11 @@ struct i915_gem_context { > */ > atomic_t active_count; > > -#define CONTEXT_SCORE_GUILTY 10 > -#define CONTEXT_SCORE_BAN_THRESHOLD 40 > - /** ban_score: Accumulated score of all hangs caused by this context. */ > - atomic_t ban_score; > + /** > + * @hang_timestamp: The last time(s) this context caused a GPU hang > + */ > + unsigned long hang_timestamp[2]; > +#define CONTEXT_FAST_HANG_JIFFIES (120 * HZ) /* 3 hangs within 120s? Banned! */ > > /** remap_slice: Bitmask of cache lines that need remapping */ > u8 remap_slice; > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 9a65341fec09..3f6eddb6f6de 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -434,11 +434,6 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, > ee->instdone.row[slice][subslice]); > } > > -static const char *bannable(const struct drm_i915_error_context *ctx) > -{ > - return ctx->bannable ? "" : " (unbannable)"; > -} > - > static void error_print_request(struct drm_i915_error_state_buf *m, > const char *prefix, > const struct drm_i915_error_request *erq, > @@ -447,9 +442,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m, > if (!erq->seqno) > return; > > - err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", > - prefix, erq->pid, erq->ban_score, > - erq->context, erq->seqno, > + err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", > + prefix, erq->pid, erq->context, erq->seqno, > test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, > &erq->flags) ? "!" : "", > test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, > @@ -463,10 +457,9 @@ static void error_print_context(struct drm_i915_error_state_buf *m, > const char *header, > const struct drm_i915_error_context *ctx) > { > - err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n", > + err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, guilty %d active %d\n", > header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id, > - ctx->sched_attr.priority, ctx->ban_score, bannable(ctx), > - ctx->guilty, ctx->active); > + ctx->sched_attr.priority, ctx->guilty, ctx->active); > } > > static void error_print_engine(struct drm_i915_error_state_buf *m, > @@ -688,12 +681,10 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, > if (!error->engine[i].context.pid) > continue; > > - err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n", > + err_printf(m, "Active process (on ring %s): %s [%d]\n", > engine_name(m->i915, i), > error->engine[i].context.comm, > - error->engine[i].context.pid, > - error->engine[i].context.ban_score, > - bannable(&error->engine[i].context)); > + error->engine[i].context.pid); > } > err_printf(m, "Reset count: %u\n", error->reset_count); > err_printf(m, "Suspend count: %u\n", error->suspend_count); > @@ -779,13 +770,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, > if (obj) { > err_puts(m, m->i915->engine[i]->name); > if (ee->context.pid) > - err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)", > + err_printf(m, " (submitted by %s [%d], ctx %d [%d])", > ee->context.comm, > ee->context.pid, > ee->context.handle, > - ee->context.hw_id, > - ee->context.ban_score, > - bannable(&ee->context)); > + ee->context.hw_id); > err_printf(m, " --- gtt_offset = 0x%08x %08x\n", > upper_32_bits(obj->gtt_offset), > lower_32_bits(obj->gtt_offset)); > @@ -1301,8 +1290,6 @@ static void record_request(struct i915_request *request, > erq->flags = request->fence.flags; > erq->context = ctx->hw_id; > erq->sched_attr = request->sched.attr; > - erq->ban_score = atomic_read(&ctx->ban_score); > - erq->seqno = request->global_seqno; > erq->jiffies = request->emitted_jiffies; > erq->start = i915_ggtt_offset(request->ring->vma); > erq->head = request->head; > @@ -1396,8 +1383,6 @@ static void record_context(struct drm_i915_error_context *e, > e->handle = ctx->user_handle; > e->hw_id = ctx->hw_id; > e->sched_attr = ctx->sched; > - e->ban_score = atomic_read(&ctx->ban_score); > - e->bannable = i915_gem_context_is_bannable(ctx); > e->guilty = atomic_read(&ctx->guilty_count); > e->active = atomic_read(&ctx->active_count); > } > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h > index afa3adb28f02..94eaf8ab9051 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.h > +++ b/drivers/gpu/drm/i915/i915_gpu_error.h > @@ -122,10 +122,8 @@ struct i915_gpu_state { > pid_t pid; > u32 handle; > u32 hw_id; > - int ban_score; > int active; > int guilty; > - bool bannable; > struct i915_sched_attr sched_attr; > } context; > > @@ -149,7 +147,6 @@ struct i915_gpu_state { > long jiffies; > pid_t pid; > u32 context; > - int ban_score; > u32 seqno; > u32 start; > u32 head; > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > index 5ab4e1c01618..a61e3a4fc9dc 100644 > --- a/drivers/gpu/drm/i915/i915_request.c > +++ b/drivers/gpu/drm/i915/i915_request.c > @@ -290,8 +290,6 @@ static void i915_request_retire(struct i915_request *request) > > i915_request_remove_from_client(request); > > - /* Retirement decays the ban score as it is a sign of ctx progress */ > - atomic_dec_if_positive(&request->gem_context->ban_score); > intel_context_unpin(request->hw_context); > > __retire_engine_upto(request->engine, request); > diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c > index 1911e00d2581..bae88a4ea924 100644 > --- a/drivers/gpu/drm/i915/i915_reset.c > +++ b/drivers/gpu/drm/i915/i915_reset.c > @@ -59,24 +59,29 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv, > > static bool context_mark_guilty(struct i915_gem_context *ctx) > { > - unsigned int score; > - bool banned, bannable; > + unsigned long prev_hang; > + bool banned; > + int i; > > atomic_inc(&ctx->guilty_count); > > - bannable = i915_gem_context_is_bannable(ctx); > - score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score); > - banned = (!i915_gem_context_is_recoverable(ctx) || > - score >= CONTEXT_SCORE_BAN_THRESHOLD); > - > /* Cool contexts don't accumulate client ban score */ This comment becomes misleading and can be removed. > - if (!bannable) > + if (!i915_gem_context_is_bannable(ctx)) > return false; > > + /* Record the timestamp for the last N hangs */ > + prev_hang = ctx->hang_timestamp[0]; > + for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) > + ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; > + ctx->hang_timestamp[i] = jiffies; > + > + /* If we have hung N+1 times in rapid succession, we ban the context! */ > + banned = !i915_gem_context_is_recoverable(ctx); > + if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) > + banned = true; Ok, the initialization to jiffies - CONTEXT_FAST_HANG_JIFFIES guarantees that it cant be banned even if it manages to immediately hang. Which in itself is difficult feat to accomplish. > if (banned) { > - DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n", > - ctx->name, atomic_read(&ctx->guilty_count), > - score); > + DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n", > + ctx->name, > atomic_read(&ctx->guilty_count)); Now when we know when it previously hanged, we could improve the logging a bit by saying how long ago. But not sure about if it worth the trouble. With the misleading comment removed/corrected, Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > i915_gem_context_set_banned(ctx); > } > > -- > 2.20.1 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx