Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Some clients, such as mesa, may only emit minimal incremental batches > that rely on the logical context state from previous batches. They know > that recovery is impossible after a hang as their required GPU state is > lost, and that each in flight and subsequent batch will hang (resetting > the context image back to default perpetuating the problem). > > To avoid getting into the state in the first place, we can allow clients > to opt out of automatic recovery and elect to ban any guilty context > following a hang. This prevents the continual stream of hangs and allows > the client to recreate their context and rebuild the state from scratch. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Kenneth Graunke <kenneth@xxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_gem.c | 3 ++- > drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++++ > drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++ > include/uapi/drm/i915_drm.h | 20 ++++++++++++++++++++ > 4 files changed, 50 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 7d45e71100bc..eee06d90d460 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) > > bannable = i915_gem_context_is_bannable(ctx); > score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score); > - banned = score >= CONTEXT_SCORE_BAN_THRESHOLD; > + banned = (i915_gem_context_is_unrecoverable(ctx) || > + score >= CONTEXT_SCORE_BAN_THRESHOLD); We treat banned contexts rather aggressively in the client-level ban scoring. Should we give some leeway to a client that tells us it doesn't need recovery, instead of being harsher on it? As it stands with this change, a third hang would lead to a client ban. 
> > /* Cool contexts don't accumulate client ban score */ > if (!bannable) > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > index 8cbe58070561..2d5e4119786a 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > @@ -878,6 +878,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, > case I915_CONTEXT_PARAM_BANNABLE: > args->value = i915_gem_context_is_bannable(ctx); > break; > + case I915_CONTEXT_PARAM_RECOVERABLE: > + args->value = !i915_gem_context_is_unrecoverable(ctx); Pondering here why the internal coding couldn't also be i915_gem_context_is_recoverable(ctx). At least here it would read better. -Mika > + break; > case I915_CONTEXT_PARAM_PRIORITY: > args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT; > break; > @@ -933,6 +936,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, > i915_gem_context_clear_bannable(ctx); > break; > > + case I915_CONTEXT_PARAM_RECOVERABLE: > + if (args->size) > + ret = -EINVAL; > + else if (args->value) > + i915_gem_context_clear_unrecoverable(ctx); > + else > + i915_gem_context_set_unrecoverable(ctx); > + break; > + > case I915_CONTEXT_PARAM_PRIORITY: > { > s64 priority = args->value; > diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h > index 08165f6a0a84..2d6b8b0307e5 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.h > +++ b/drivers/gpu/drm/i915/i915_gem_context.h > @@ -123,6 +123,7 @@ struct i915_gem_context { > #define UCONTEXT_NO_ZEROMAP 0 > #define UCONTEXT_NO_ERROR_CAPTURE 1 > #define UCONTEXT_BANNABLE 2 > +#define UCONTEXT_NO_RECOVERY 3 > > /** > * @flags: small set of booleans > @@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx) > clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags); > } > > +static inline bool i915_gem_context_is_unrecoverable(const struct i915_gem_context *ctx) 
> +{ > + return test_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags); > +} > + > +static inline void i915_gem_context_set_unrecoverable(struct i915_gem_context *ctx) > +{ > + set_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags); > +} > + > +static inline void i915_gem_context_clear_unrecoverable(struct i915_gem_context *ctx) > +{ > + clear_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags); > +} > + > static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx) > { > return test_bit(CONTEXT_BANNED, &ctx->flags); > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > index 298b2e197744..eeb258a12b95 100644 > --- a/include/uapi/drm/i915_drm.h > +++ b/include/uapi/drm/i915_drm.h > @@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param { > #define I915_CONTEXT_MAX_USER_PRIORITY 1023 /* inclusive */ > #define I915_CONTEXT_DEFAULT_PRIORITY 0 > #define I915_CONTEXT_MIN_USER_PRIORITY -1023 /* inclusive */ > + > +/* > + * Not all clients may want to attempt automatic recover of a context after > + * a hang (for example, some clients may only submit very small incremental > + * batches relying on known logical state of previous batches which will never > + * recover correctly and each attempt will hang), and so would prefer that > + * the context is forever banned instead. > + * > + * If set to false (0), after a reset, subsequent (and in flight) rendering > + * from this context is discarded, and the client will need to create a new > + * context to use instead. > + * > + * If set to true (1), the kernel will automatically attempt to recover the > + * context by skipping the hanging batch and executing the next batch starting > + * from the default context state (discarding the incomplete logical context > + * state lost due to the reset). > + * > + * On creation, all new contexts are marked as recoverable. 
> + */ > +#define I915_CONTEXT_PARAM_RECOVERABLE 0x7 > __u64 value; > }; > > -- > 2.19.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx