Re: [PATCH] drm/i915: Report if an unbannable context is involved in a GPU hang

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> Since unbannable contexts are special and supposed not to be causing GPU
> hangs in the first place, make it clear when they are implicated in said
> hang. In practice, most unbannable contexts are those created by igt
> for the express purpose of throwing untold thousands of hangs at the GPU
> and wish to keep doing so to finish the test. Normally they are cleaned
> up, but it's when they or the other unbannable kernel contexts stay
> stuck in an erroneous state that we need to worry and so need
> highlighting.
>
> Suggested-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>

Well, this should make things obvious if this happens.

Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>

> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  1 +
>  drivers/gpu/drm/i915/i915_gpu_error.c | 21 +++++++++++++++------
>  2 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4e158aab36d6..d6b5ac2a563d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -555,6 +555,7 @@ struct i915_gpu_state {
>  			int ban_score;
>  			int active;
>  			int guilty;
> +			bool bannable;
>  		} context;
>  
>  		struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a81351d9e3a6..67c902412193 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -396,6 +396,11 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
>  			   ee->instdone.row[slice][subslice]);
>  }
>  
> +static const char *bannable(const struct drm_i915_error_context *ctx)
> +{
> +	return ctx->bannable ? "" : " (unbannable)";
> +}
> +
>  static void error_print_request(struct drm_i915_error_state_buf *m,
>  				const char *prefix,
>  				const struct drm_i915_error_request *erq)
> @@ -414,9 +419,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
>  				const char *header,
>  				const struct drm_i915_error_context *ctx)
>  {
> -	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d guilty %d active %d\n",
> +	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
>  		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> -		   ctx->priority, ctx->ban_score, ctx->guilty, ctx->active);
> +		   ctx->priority, ctx->ban_score, bannable(ctx),
> +		   ctx->guilty, ctx->active);
>  }
>  
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
> @@ -644,11 +650,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
>  		if (error->engine[i].hangcheck_stalled &&
>  		    error->engine[i].context.pid) {
> -			err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
> +			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
>  				   engine_name(m->i915, i),
>  				   error->engine[i].context.comm,
>  				   error->engine[i].context.pid,
> -				   error->engine[i].context.ban_score);
> +				   error->engine[i].context.ban_score,
> +				   bannable(&error->engine[i].context));
>  		}
>  	}
>  	err_printf(m, "Reset count: %u\n", error->reset_count);
> @@ -736,12 +743,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  		if (obj) {
>  			err_puts(m, dev_priv->engine[i]->name);
>  			if (ee->context.pid)
> -				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
> +				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
>  					   ee->context.comm,
>  					   ee->context.pid,
>  					   ee->context.handle,
>  					   ee->context.hw_id,
> -					   ee->context.ban_score);
> +					   ee->context.ban_score,
> +					   bannable(&ee->context));
>  			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
>  				   upper_32_bits(obj->gtt_offset),
>  				   lower_32_bits(obj->gtt_offset));
> @@ -1383,6 +1391,7 @@ static void record_context(struct drm_i915_error_context *e,
>  	e->hw_id = ctx->hw_id;
>  	e->priority = ctx->priority;
>  	e->ban_score = atomic_read(&ctx->ban_score);
> +	e->bannable = i915_gem_context_is_bannable(ctx);
>  	e->guilty = atomic_read(&ctx->guilty_count);
>  	e->active = atomic_read(&ctx->active_count);
>  }
> -- 
> 2.15.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux