Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Some clients, such as mesa, may only emit minimal incremental batches > that rely on the logical context state from previous batches. They know > that recovery is impossible after a hang as their required GPU state is > lost, and that each in flight and subsequent batch will hang (resetting > the context image back to default perpetuating the problem). > > To avoid getting into the state in the first place, we can allow clients > to opt out of automatic recovery and elect to ban any guilty context > following a hang. This prevents the continual stream of hangs and allows > the client to recreate their context and rebuild the state from scratch. > > v2: Prefer calling it recoverable rather than unrecoverable. > Ok. You went forward with this. I am fine with both flavours, Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Kenneth Graunke <kenneth@xxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_gem.c | 3 ++- > drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++ > drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++ > include/uapi/drm/i915_drm.h | 20 ++++++++++++++++++++ > 4 files changed, 51 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 7d45e71100bc..9b02d70fbabf 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) > > bannable = i915_gem_context_is_bannable(ctx); > score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score); > - banned = score >= CONTEXT_SCORE_BAN_THRESHOLD; > + banned = (!i915_gem_context_is_recoverable(ctx) || > + score >= CONTEXT_SCORE_BAN_THRESHOLD); > > /* Cool contexts don't accumulate client ban score */ > if (!bannable) > diff --git 
a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > index 8cbe58070561..b31da0375c8c 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > @@ -338,6 +338,7 @@ __create_hw_context(struct drm_i915_private *dev_priv, > list_add_tail(&ctx->link, &dev_priv->contexts.list); > ctx->i915 = dev_priv; > ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL); > + ctx->user_flags = BIT(UCONTEXT_RECOVERABLE); > > for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) { > struct intel_context *ce = &ctx->__engine[n]; > @@ -878,6 +879,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, > case I915_CONTEXT_PARAM_BANNABLE: > args->value = i915_gem_context_is_bannable(ctx); > break; > + case I915_CONTEXT_PARAM_RECOVERABLE: > + args->value = i915_gem_context_is_recoverable(ctx); > + break; > case I915_CONTEXT_PARAM_PRIORITY: > args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT; > break; > @@ -933,6 +937,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, > i915_gem_context_clear_bannable(ctx); > break; > > + case I915_CONTEXT_PARAM_RECOVERABLE: > + if (args->size) > + ret = -EINVAL; > + else if (args->value) > + i915_gem_context_set_recoverable(ctx); > + else > + i915_gem_context_clear_recoverable(ctx); > + break; > + > case I915_CONTEXT_PARAM_PRIORITY: > { > s64 priority = args->value; > diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h > index 08165f6a0a84..397d50b826d7 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.h > +++ b/drivers/gpu/drm/i915/i915_gem_context.h > @@ -123,6 +123,7 @@ struct i915_gem_context { > #define UCONTEXT_NO_ZEROMAP 0 > #define UCONTEXT_NO_ERROR_CAPTURE 1 > #define UCONTEXT_BANNABLE 2 > +#define UCONTEXT_RECOVERABLE 3 > > /** > * @flags: small set of booleans > @@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx) 
> clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags); > } > > +static inline bool i915_gem_context_is_recoverable(const struct i915_gem_context *ctx) > +{ > + return test_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags); > +} > + > +static inline void i915_gem_context_set_recoverable(struct i915_gem_context *ctx) > +{ > + set_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags); > +} > + > +static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *ctx) > +{ > + clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags); > +} > + > static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx) > { > return test_bit(CONTEXT_BANNED, &ctx->flags); > } > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h > index 298b2e197744..eeb258a12b95 100644 > --- a/include/uapi/drm/i915_drm.h > +++ b/include/uapi/drm/i915_drm.h > @@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param { > #define I915_CONTEXT_MAX_USER_PRIORITY 1023 /* inclusive */ > #define I915_CONTEXT_DEFAULT_PRIORITY 0 > #define I915_CONTEXT_MIN_USER_PRIORITY -1023 /* inclusive */ > + > +/* > + * Not all clients may want to attempt automatic recovery of a context after > + * a hang (for example, some clients may only submit very small incremental > + * batches relying on known logical state of previous batches which will never > + * recover correctly and each attempt will hang), and so would prefer that > + * the context is forever banned instead. > + * > + * If set to false (0), after a reset, subsequent (and in flight) rendering > + * from this context is discarded, and the client will need to create a new > + * context to use instead. > + * > + * If set to true (1), the kernel will automatically attempt to recover the > + * context by skipping the hanging batch and executing the next batch starting > + * from the default context state (discarding the incomplete logical context > + * state lost due to the reset). > + * > + * On creation, all new contexts are marked as recoverable. 
> + */ > +#define I915_CONTEXT_PARAM_RECOVERABLE 0x7 > __u64 value; > }; > > -- > 2.19.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx