Re: [PATCH v2] drm/i915: Optionally disable automatic recovery after a GPU reset

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> Some clients, such as mesa, may only emit minimal incremental batches
> that rely on the logical context state from previous batches. They know
> that recovery is impossible after a hang as their required GPU state is
> lost, and that each in flight and subsequent batch will hang (resetting
> the context image back to default perpetuating the problem).
>
> To avoid getting into the state in the first place, we can allow clients
> to opt out of automatic recovery and elect to ban any guilty context
> following a hang. This prevents the continual stream of hangs and allows
> the client to recreate their context and rebuild the state from scratch.
>
> v2: Prefer calling it recoverable rather than unrecoverable.
>

Ok. You went forward with this. I am fine with both
flavours,

Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>

> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Kenneth Graunke <kenneth@xxxxxxxxxxxxx>
> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  3 ++-
>  drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
>  include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
>  4 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 7d45e71100bc..9b02d70fbabf 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>  
>  	bannable = i915_gem_context_is_bannable(ctx);
>  	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> +	banned = (!i915_gem_context_is_recoverable(ctx) ||
> +		  score >= CONTEXT_SCORE_BAN_THRESHOLD);
>  
>  	/* Cool contexts don't accumulate client ban score */
>  	if (!bannable)
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 8cbe58070561..b31da0375c8c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -338,6 +338,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>  	list_add_tail(&ctx->link, &dev_priv->contexts.list);
>  	ctx->i915 = dev_priv;
>  	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
> +	ctx->user_flags = BIT(UCONTEXT_RECOVERABLE);
>  
>  	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
>  		struct intel_context *ce = &ctx->__engine[n];
> @@ -878,6 +879,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_RECOVERABLE:
> +		args->value = i915_gem_context_is_recoverable(ctx);
> +		break;
>  	case I915_CONTEXT_PARAM_PRIORITY:
>  		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
>  		break;
> @@ -933,6 +937,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
>  
> +	case I915_CONTEXT_PARAM_RECOVERABLE:
> +		if (args->size)
> +			ret = -EINVAL;
> +		else if (args->value)
> +			i915_gem_context_set_recoverable(ctx);
> +		else
> +			i915_gem_context_clear_recoverable(ctx);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_PRIORITY:
>  		{
>  			s64 priority = args->value;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 08165f6a0a84..397d50b826d7 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -123,6 +123,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ZEROMAP		0
>  #define UCONTEXT_NO_ERROR_CAPTURE	1
>  #define UCONTEXT_BANNABLE		2
> +#define UCONTEXT_RECOVERABLE		3
>  
>  	/**
>  	 * @flags: small set of booleans
> @@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
>  	clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
>  }
>  
> +static inline bool i915_gem_context_is_recoverable(const struct i915_gem_context *ctx)
> +{
> +	return test_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_recoverable(struct i915_gem_context *ctx)
> +{
> +	set_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *ctx)
> +{
> +	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
>  {
>  	return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 298b2e197744..eeb258a12b95 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param {
>  #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
>  #define   I915_CONTEXT_DEFAULT_PRIORITY		0
>  #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
> +
> +/*
> + * Not all clients may want to attempt automatic recover of a context after
> + * a hang (for example, some clients may only submit very small incremental
> + * batches relying on known logical state of previous batches which will never
> + * recover correctly and each attempt will hang), and so would prefer that
> + * the context is forever banned instead.
> + *
> + * If set to false (0), after a reset, subsequent (and in flight) rendering
> + * from this context is discarded, and the client will need to create a new
> + * context to use instead.
> + *
> + * If set to true (1), the kernel will automatically attempt to recover the
> + * context by skipping the hanging batch and executing the next batch starting
> + * from the default context state (discarding the incomplete logical context
> + * state lost due to the reset).
> + *
> + * On creation, all new contexts are marked as recoverable.
> + */
> +#define I915_CONTEXT_PARAM_RECOVERABLE	0x7
>  	__u64 value;
>  };
>  
> -- 
> 2.19.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux