> -----Original Message-----
> From: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Sent: Tuesday, August 6, 2019 6:47 AM
> To: intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>; Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx>; Winiarski, Michal <michal.winiarski@xxxxxxxxx>; Bloomfield, Jon <jon.bloomfield@xxxxxxxxx>
> Subject: [PATCH 5/5] drm/i915: Cancel non-persistent contexts on close
>
> Normally, we rely on our hangcheck to prevent persistent batches from
> hogging the GPU. However, if the user disables hangcheck, this mechanism
> breaks down. Despite our insistence that this is unsafe, the users are
> equally insistent that they want to use endless batches and will disable
> the hangcheck mechanism. We are looking at perhaps replacing hangcheck
> with a softer mechanism that sends a pulse down the engine to check if
> it is well. We can use the same preemptive pulse to flush an active
> persistent context off the GPU upon context close, preventing resources
> being lost and unkillable requests remaining on the GPU after process
> termination. To avoid changing the ABI and accidentally breaking
> existing userspace, we make the persistence of a context explicit and
> enable it by default. Userspace can opt out of persistent mode (forcing
> requests to be cancelled when the context is closed by process
> termination or explicitly) via a context parameter, or, to facilitate
> existing use-cases, by disabling hangcheck (i915.enable_hangcheck=0).
> (Note, one of the outcomes for supporting endless mode will be the
> removal of hangchecking, at which point opting into persistent mode will
> be mandatory, or maybe the default.)
>
> Testcase: igt/gem_ctx_persistence
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx>
> Cc: Michał Winiarski <michal.winiarski@xxxxxxxxx>
> Cc: Jon Bloomfield <jon.bloomfield@xxxxxxxxx>
>
> ---
> Same sort of caveats as for hangcheck, a few corner cases need
> struct_mutex and some preallocation.
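As an aside, the userspace side of opting out should then be as simple as the
sketch below. Untested, written only against the uapi added in this patch;
set_nonpersistent() and the fd/ctx_id handling are my own illustration, not
part of the series:

#include <errno.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Ask the kernel to cancel this context's requests when it is closed. */
static int set_nonpersistent(int fd, __u32 ctx_id)
{
	struct drm_i915_gem_context_param arg = {
		.ctx_id = ctx_id,
		.param = I915_CONTEXT_PARAM_PERSISTENCE,
		.value = 0,	/* 0 = non-persistent, 1 = keep running after close */
	};

	/* .size is left 0; set_persistence() rejects anything else with -EINVAL */
	return ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg) ? -errno : 0;
}

Kernels without this patch reject the unknown param with -EINVAL, which a
client can treat as "persistence cannot be disabled".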
> ---
>  drivers/gpu/drm/i915/Makefile                 |  3 +-
>  drivers/gpu/drm/i915/gem/i915_gem_context.c   | 79 +++++++++++++++++++
>  drivers/gpu/drm/i915/gem/i915_gem_context.h   | 15 ++++
>  .../gpu/drm/i915/gem/i915_gem_context_types.h |  1 +
>  .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 53 +++++++++++++
>  .../gpu/drm/i915/gt/intel_engine_heartbeat.h  | 14 ++++
>  include/uapi/drm/i915_drm.h                   | 15 ++++
>  7 files changed, 179 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
>  create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index a1016858d014..42417d87496e 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -71,9 +71,10 @@ obj-y += gt/
>  gt-y += \
>  	gt/intel_breadcrumbs.o \
>  	gt/intel_context.o \
> -	gt/intel_engine_pool.o \
>  	gt/intel_engine_cs.o \
> +	gt/intel_engine_heartbeat.o \
>  	gt/intel_engine_pm.o \
> +	gt/intel_engine_pool.o \
>  	gt/intel_gt.o \
>  	gt/intel_gt_pm.o \
>  	gt/intel_hangcheck.o \
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 64f7a533e886..5718b74f95b7 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -70,6 +70,7 @@
>  #include <drm/i915_drm.h>
>
>  #include "gt/intel_lrc_reg.h"
> +#include "gt/intel_engine_heartbeat.h"
>
>  #include "i915_gem_context.h"
>  #include "i915_globals.h"
> @@ -373,6 +374,45 @@ void i915_gem_context_release(struct kref *ref)
>  	queue_work(i915->wq, &i915->contexts.free_work);
>  }
>
> +static void kill_context(struct i915_gem_context *ctx)
> +{
> +	struct i915_gem_engines_iter it;
> +	struct intel_engine_cs *engine;
> +	intel_engine_mask_t tmp, active;
> +	struct intel_context *ce;
> +
> +	if (i915_gem_context_is_banned(ctx))
> +		return;
> +
> +	i915_gem_context_set_banned(ctx);
> +
> +	active = 0;
> +	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
> +		struct i915_request *rq;
> +
> +		if (!ce->ring)
> +			continue;
> +
> +		rq = i915_active_request_get_unlocked(&ce->ring->timeline->last_request);
> +		if (!rq)
> +			continue;
> +
> +		active |= rq->engine->mask;
> +		i915_request_put(rq);
> +	}
> +	i915_gem_context_unlock_engines(ctx);
> +
> +	/*
> +	 * Send a "high priority pulse" down the engine to cause the
> +	 * current request to be momentarily preempted. (If it fails to
> +	 * be preempted, it will be reset). As we have marked our context
> +	 * as banned, any incomplete request, including any running, will
> +	 * be skipped.
> +	 */
> +	for_each_engine_masked(engine, ctx->i915, active, tmp)
> +		intel_engine_pulse(engine);
> +}
> +
>  static void context_close(struct i915_gem_context *ctx)
>  {
>  	mutex_lock(&ctx->mutex);
> @@ -394,6 +434,15 @@ static void context_close(struct i915_gem_context *ctx)
>  	lut_close(ctx);
>
>  	mutex_unlock(&ctx->mutex);
> +
> +	/*
> +	 * If the user has disabled hangchecking, we can not be sure that
> +	 * the batches will ever complete and let the context be freed.
> +	 * Forcibly kill off any remaining requests in this case.
> +	 */

Persistence is independent of hangcheck, which makes the above comment
unrepresentative of the code below. Do we even need a comment here? It looks
self-explanatory. Instead, maybe comment the defaulting state in
__create_context() to make the connection between hangcheck and persistence.
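Something along these lines, perhaps (only a rough sketch of the wording on
top of your __create_context() hunk, adjust as you see fit):

	i915_gem_context_set_bannable(ctx);
	i915_gem_context_set_recoverable(ctx);
	/*
	 * With hangcheck enabled, we rely on it to prevent a closed
	 * context's batches from hogging the GPU indefinitely, so new
	 * contexts may default to being persistent. Without hangcheck
	 * there is no such backstop, so default those contexts to having
	 * their outstanding requests cancelled on close.
	 */
	if (i915_modparams.enable_hangcheck)
		i915_gem_context_set_persistence(ctx);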
> +	if (!i915_gem_context_is_persistent(ctx))
> +		kill_context(ctx);
> +
>  	i915_gem_context_put(ctx);
>  }
>
> @@ -433,6 +482,8 @@ __create_context(struct drm_i915_private *i915)
>
>  	i915_gem_context_set_bannable(ctx);
>  	i915_gem_context_set_recoverable(ctx);
> +	if (i915_modparams.enable_hangcheck)
> +		i915_gem_context_set_persistence(ctx);
>
>  	ctx->ring_size = 4 * PAGE_SIZE;
>
> @@ -1745,6 +1796,25 @@ get_engines(struct i915_gem_context *ctx,
>  	return err;
>  }
>
> +static int
> +set_persistence(struct i915_gem_context *ctx,
> +		const struct drm_i915_gem_context_param *args)
> +{
> +	if (args->size)
> +		return -EINVAL;
> +
> +	if (!args->value) {
> +		i915_gem_context_clear_persistence(ctx);
> +		return 0;
> +	}
> +
> +	if (!(ctx->i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION))
> +		return -ENODEV;
> +
> +	i915_gem_context_set_persistence(ctx);
> +	return 0;
> +}
> +
>  static int ctx_setparam(struct drm_i915_file_private *fpriv,
>  			struct i915_gem_context *ctx,
>  			struct drm_i915_gem_context_param *args)
> @@ -1822,6 +1892,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
>  		ret = set_engines(ctx, args);
>  		break;
>
> +	case I915_CONTEXT_PARAM_PERSISTENCE:
> +		ret = set_persistence(ctx, args);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  	default:
>  		ret = -EINVAL;
> @@ -2278,6 +2352,11 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  		ret = get_engines(ctx, args);
>  		break;
>
> +	case I915_CONTEXT_PARAM_PERSISTENCE:
> +		args->size = 0;
> +		args->value = i915_gem_context_is_persistent(ctx);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  	default:
>  		ret = -EINVAL;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> index 106e2ccf7a4c..1834d1df2ac9 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> @@ -74,6 +74,21 @@ static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *c
>  	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
>  }
>
> +static inline bool i915_gem_context_is_persistent(const struct i915_gem_context *ctx)
> +{
> +	return test_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_persistence(struct i915_gem_context *ctx)
> +{
> +	set_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_persistence(struct i915_gem_context *ctx)
> +{
> +	clear_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
>  {
>  	return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index a02d98494078..bf41b43ffa9a 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -137,6 +137,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ERROR_CAPTURE	1
>  #define UCONTEXT_BANNABLE		2
>  #define UCONTEXT_RECOVERABLE		3
> +#define UCONTEXT_PERSISTENCE		4
>
>  	/**
>  	 * @flags: small set of booleans
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> new file mode 100644
> index 000000000000..0c0632a423a7
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> @@ -0,0 +1,53 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "i915_request.h"
> +
> +#include "intel_context.h"
> +#include "intel_engine_heartbeat.h"
> +#include "intel_engine_pm.h"
> +#include "intel_engine.h"
> +#include "intel_gt.h"
> +
> +void intel_engine_pulse(struct intel_engine_cs *engine)
> +{
> +	struct intel_context *ce = engine->kernel_context;
> +	struct i915_sched_attr attr = { .priority = INT_MAX };
> +	struct i915_request *rq;
> +	int err = 0;
> +
> +	GEM_BUG_ON(!engine->schedule);
> +
> +	if (!intel_engine_pm_get_if_awake(engine))
> +		return;
> +
> +	mutex_lock(&ce->ring->timeline->mutex);
> +
> +	intel_context_enter(ce);
> +	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
> +	intel_context_exit(ce);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto out_unlock;
> +	}
> +	i915_request_get(rq);
> +
> +	rq->flags |= I915_REQUEST_SENTINEL;
> +	__i915_request_commit(rq);
> +
> +	local_bh_disable();
> +	engine->schedule(rq, &attr);
> +	local_bh_enable();
> +
> +	i915_request_put(rq);
> +
> +out_unlock:
> +	mutex_unlock(&ce->ring->timeline->mutex);
> +	intel_context_timeline_unlock(ce);
> +	intel_engine_pm_put(engine);
> +	if (err) /* XXX must not be allowed to fail */
> +		DRM_ERROR("Failed to ping %s, err=%d\n", engine->name, err);
> +}
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> new file mode 100644
> index 000000000000..86761748dc21
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> @@ -0,0 +1,14 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef INTEL_ENGINE_HEARTBEAT_H
> +#define INTEL_ENGINE_HEARTBEAT_H
> +
> +struct intel_engine_cs;
> +
> +void intel_engine_pulse(struct intel_engine_cs *engine);
> +
> +#endif /* INTEL_ENGINE_HEARTBEAT_H */
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 469dc512cca3..dbc8691d75d0 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1565,6 +1565,21 @@ struct drm_i915_gem_context_param {
>   * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
>   */
>  #define I915_CONTEXT_PARAM_ENGINES	0xa
> +
> +/*
> + * I915_CONTEXT_PARAM_PERSISTENCE:
> + *
> + * Allow the context and active rendering to survive the process until
> + * completion. Persistence allows fire-and-forget clients to queue up a
> + * bunch of work, hand the output over to a display server and then quit.
> + * If the context is not marked as persistent, upon closing (either via
> + * an explicit DRM_I915_GEM_CONTEXT_DESTROY or implicitly from file closure
> + * or process termination), the context and any outstanding requests will be
> + * cancelled (and exported fences for cancelled requests marked as -EIO).
> + *
> + * By default, new contexts allow persistence.
> + */
> +#define I915_CONTEXT_PARAM_PERSISTENCE	0xb
>  /* Must be kept compact -- no holes and well documented */
>
>  	__u64 value;
> --
> 2.23.0.rc1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx