Re: [PATCH 2/2] drm/i915: Use time based guilty context banning

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> Currently, we accumulate each time a context hangs the GPU, offset
> against the number of requests it submits, and if that score exceeds a
> certain threshold, we ban that context from submitting any more requests
> (cancelling any work in flight). In contrast, we use a simple timer on
> the file, that if we see more than a 9 hangs faster than 60s apart in
> total across all of its contexts, we will ban the client from creating
> any more contexts. This leads to a confusing situation where the file
> may be banned before the context, so lets use a simple timer scheme for
> each.
>
> If the context submits 3 hanging requests within a 120s period, declare
> it forbidden to ever send more requests.
>
> This has the advantage of not being easy to repair by simply sending
> empty requests, but has the disadvantage that if the context is idle
> then it is forgiven. However, if the context is idle, it is not
> disrupting the system, but a hog can evade the request counting and
> cause much more severe disruption to the system.
>
> Updating ban_score from request retirement is dubious as the retirement
> is purposely not in sync with request submission (i.e. we try and batch
> retirement to reduce overhead and avoid latency on submission), which
> leads to surprising situations where we can forgive a hang immediately
> due to a backlog of requests from before the hang being retired
> afterwards.
>
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c |  4 ++++
>  drivers/gpu/drm/i915/i915_gem_context.h |  9 +++----
>  drivers/gpu/drm/i915/i915_gpu_error.c   | 31 +++++++------------------
>  drivers/gpu/drm/i915/i915_gpu_error.h   |  3 ---
>  drivers/gpu/drm/i915/i915_request.c     |  2 --
>  drivers/gpu/drm/i915/i915_reset.c       | 27 ++++++++++++---------
>  6 files changed, 33 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index da21c843fed8..7541c6f961b3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -355,6 +355,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>  	struct i915_gem_context *ctx;
>  	unsigned int n;
>  	int ret;
> +	int i;
>  
>  	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
>  	if (ctx == NULL)
> @@ -407,6 +408,9 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>  	ctx->desc_template =
>  		default_desc_template(dev_priv, dev_priv->mm.aliasing_ppgtt);
>  
> +	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
> +		ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
> +
>  	return ctx;
>  
>  err_pid:
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 071108d34ae0..dc6c58f38cfa 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -209,10 +209,11 @@ struct i915_gem_context {
>  	 */
>  	atomic_t active_count;
>  
> -#define CONTEXT_SCORE_GUILTY		10
> -#define CONTEXT_SCORE_BAN_THRESHOLD	40
> -	/** ban_score: Accumulated score of all hangs caused by this context. */
> -	atomic_t ban_score;
> +	/**
> +	 * @hang_timestamp: The last time(s) this context caused a GPU hang
> +	 */
> +	unsigned long hang_timestamp[2];
> +#define CONTEXT_FAST_HANG_JIFFIES (120 * HZ) /* 3 hangs within 120s? Banned! */
>  
>  	/** remap_slice: Bitmask of cache lines that need remapping */
>  	u8 remap_slice;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 9a65341fec09..3f6eddb6f6de 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -434,11 +434,6 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
>  			   ee->instdone.row[slice][subslice]);
>  }
>  
> -static const char *bannable(const struct drm_i915_error_context *ctx)
> -{
> -	return ctx->bannable ? "" : " (unbannable)";
> -}
> -
>  static void error_print_request(struct drm_i915_error_state_buf *m,
>  				const char *prefix,
>  				const struct drm_i915_error_request *erq,
> @@ -447,9 +442,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
>  	if (!erq->seqno)
>  		return;
>  
> -	err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
> -		   prefix, erq->pid, erq->ban_score,
> -		   erq->context, erq->seqno,
> +	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
> +		   prefix, erq->pid, erq->context, erq->seqno,
>  		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
>  			    &erq->flags) ? "!" : "",
>  		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> @@ -463,10 +457,9 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
>  				const char *header,
>  				const struct drm_i915_error_context *ctx)
>  {
> -	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
> +	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, guilty %d active %d\n",
>  		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> -		   ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
> -		   ctx->guilty, ctx->active);
> +		   ctx->sched_attr.priority, ctx->guilty, ctx->active);
>  }
>  
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
> @@ -688,12 +681,10 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
>  		if (!error->engine[i].context.pid)
>  			continue;
>  
> -		err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
> +		err_printf(m, "Active process (on ring %s): %s [%d]\n",
>  			   engine_name(m->i915, i),
>  			   error->engine[i].context.comm,
> -			   error->engine[i].context.pid,
> -			   error->engine[i].context.ban_score,
> -			   bannable(&error->engine[i].context));
> +			   error->engine[i].context.pid);
>  	}
>  	err_printf(m, "Reset count: %u\n", error->reset_count);
>  	err_printf(m, "Suspend count: %u\n", error->suspend_count);
> @@ -779,13 +770,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
>  		if (obj) {
>  			err_puts(m, m->i915->engine[i]->name);
>  			if (ee->context.pid)
> -				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
> +				err_printf(m, " (submitted by %s [%d], ctx %d [%d])",
>  					   ee->context.comm,
>  					   ee->context.pid,
>  					   ee->context.handle,
> -					   ee->context.hw_id,
> -					   ee->context.ban_score,
> -					   bannable(&ee->context));
> +					   ee->context.hw_id);
>  			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
>  				   upper_32_bits(obj->gtt_offset),
>  				   lower_32_bits(obj->gtt_offset));
> @@ -1301,8 +1290,6 @@ static void record_request(struct i915_request *request,
>  	erq->flags = request->fence.flags;
>  	erq->context = ctx->hw_id;
>  	erq->sched_attr = request->sched.attr;
> -	erq->ban_score = atomic_read(&ctx->ban_score);
> -	erq->seqno = request->global_seqno;
>  	erq->jiffies = request->emitted_jiffies;
>  	erq->start = i915_ggtt_offset(request->ring->vma);
>  	erq->head = request->head;
> @@ -1396,8 +1383,6 @@ static void record_context(struct drm_i915_error_context *e,
>  	e->handle = ctx->user_handle;
>  	e->hw_id = ctx->hw_id;
>  	e->sched_attr = ctx->sched;
> -	e->ban_score = atomic_read(&ctx->ban_score);
> -	e->bannable = i915_gem_context_is_bannable(ctx);
>  	e->guilty = atomic_read(&ctx->guilty_count);
>  	e->active = atomic_read(&ctx->active_count);
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index afa3adb28f02..94eaf8ab9051 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -122,10 +122,8 @@ struct i915_gpu_state {
>  			pid_t pid;
>  			u32 handle;
>  			u32 hw_id;
> -			int ban_score;
>  			int active;
>  			int guilty;
> -			bool bannable;
>  			struct i915_sched_attr sched_attr;
>  		} context;
>  
> @@ -149,7 +147,6 @@ struct i915_gpu_state {
>  			long jiffies;
>  			pid_t pid;
>  			u32 context;
> -			int ban_score;
>  			u32 seqno;
>  			u32 start;
>  			u32 head;
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5ab4e1c01618..a61e3a4fc9dc 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -290,8 +290,6 @@ static void i915_request_retire(struct i915_request *request)
>  
>  	i915_request_remove_from_client(request);
>  
> -	/* Retirement decays the ban score as it is a sign of ctx progress */
> -	atomic_dec_if_positive(&request->gem_context->ban_score);
>  	intel_context_unpin(request->hw_context);
>  
>  	__retire_engine_upto(request->engine, request);
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 1911e00d2581..bae88a4ea924 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -59,24 +59,29 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
>  
>  static bool context_mark_guilty(struct i915_gem_context *ctx)
>  {
> -	unsigned int score;
> -	bool banned, bannable;
> +	unsigned long prev_hang;
> +	bool banned;
> +	int i;
>  
>  	atomic_inc(&ctx->guilty_count);
>  
> -	bannable = i915_gem_context_is_bannable(ctx);
> -	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -	banned = (!i915_gem_context_is_recoverable(ctx) ||
> -		  score >= CONTEXT_SCORE_BAN_THRESHOLD);
> -
>  	/* Cool contexts don't accumulate client ban score */

This comment becomes misleading and can be removed.

> -	if (!bannable)
> +	if (!i915_gem_context_is_bannable(ctx))
>  		return false;
>  
> +	/* Record the timestamp for the last N hangs */
> +	prev_hang = ctx->hang_timestamp[0];
> +	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
> +		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
> +	ctx->hang_timestamp[i] = jiffies;
> +
> +	/* If we have hung N+1 times in rapid succession, we ban the context! */
> +	banned = !i915_gem_context_is_recoverable(ctx);
> +	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
> +		banned = true;

Ok, the initialization to jiffies - CONTEXT_FAST_HANG_JIFFIES guarantees
that it cant be banned even if it manages to immediately hang. Which
in itself is difficult feat to accomplish.

>  	if (banned) {
> -		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
> -				 ctx->name, atomic_read(&ctx->guilty_count),
> -				 score);
> +		DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
> +				 ctx->name,
> atomic_read(&ctx->guilty_count));

Now when we know when it previously hanged, we could improve the
logging a bit by saying how long ago. But not sure
about if it worth the trouble.

With the misleading comment removed/corrected,
Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>


>  		i915_gem_context_set_banned(ctx);
>  	}
>  
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux