> -----Original Message-----
> From: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Sent: Monday, August 26, 2019 12:22 AM
> To: intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>; Joonas Lahtinen
> <joonas.lahtinen@xxxxxxxxxxxxxxx>; Winiarski, Michal
> <michal.winiarski@xxxxxxxxx>; Bloomfield, Jon <jon.bloomfield@xxxxxxxxx>
> Subject: [PATCH 27/28] drm/i915: Cancel non-persistent contexts on close
>
> Normally, we rely on our hangcheck to prevent persistent batches from
> hogging the GPU. However, if the user disables hangcheck, this mechanism
> breaks down. Despite our insistence that this is unsafe, the users are
> equally insistent that they want to use endless batches and will disable
> the hangcheck mechanism. We are looking are perhaps replacing hangcheck

"looking at"?

> with a softer mechanism, that sends a pulse down the engine to check if
> it is well. We can use the same preemptive pulse to flush an active
> persistent context off the GPU upon context close, preventing resources
> being lost and unkillable requests remaining on the GPU after process
> termination. To avoid changing the ABI and accidentally breaking
> existing userspace, we make the persistence of a context explicit and
> enable it by default (matching current ABI). Userspace can opt out of
> persistent mode (forcing requests to be cancelled when the context is
> closed by process termination or explicitly) by a context parameter. To
> facilitate existing use-cases of disabling hangcheck, if the modparam is
> disabled (i915.enable_hangcheck=0), we disable peristence mode by
> default. (Note, one of the outcomes for supporting endless mode will be
> the removal of hangchecking, at which point opting into persistent mode
> will be mandatory, or maybe the default perhaps controlled by cgroups.)
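As an aside, for anyone wiring up userspace: with this uapi, opting out is
just a setparam at context creation. A minimal, untested sketch (assumes
libdrm's drmIoctl() and uapi headers carrying the new
I915_CONTEXT_PARAM_PERSISTENCE from this patch; the helper name is mine):

    #include <errno.h>
    #include <string.h>
    #include <xf86drm.h>
    #include <drm/i915_drm.h>

    /* Create a context whose requests are cancelled when it is closed. */
    static int create_nonpersistent_ctx(int fd, __u32 *ctx_id)
    {
            struct drm_i915_gem_context_create create;
            struct drm_i915_gem_context_param param;

            memset(&create, 0, sizeof(create));
            if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create))
                    return -errno;

            memset(&param, 0, sizeof(param));
            param.ctx_id = create.ctx_id;
            param.param = I915_CONTEXT_PARAM_PERSISTENCE;
            param.value = 0; /* opt out: cancel requests on context close */
            if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &param))
                    return -errno; /* expect ENODEV without preemption */

            *ctx_id = create.ctx_id;
            return 0;
    }

Given the enable_hangcheck coupling described above, userspace that relies
on cancel-on-close behaviour should set the parameter explicitly rather
than trust the modparam-dependent default.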
>
> Testcase: igt/gem_ctx_persistence
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx>
> Cc: Michał Winiarski <michal.winiarski@xxxxxxxxx>
> Cc: Jon Bloomfield <jon.bloomfield@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/Makefile                 |   3 +-
>  drivers/gpu/drm/i915/gem/i915_gem_context.c   | 122 ++++++++++++++++++
>  drivers/gpu/drm/i915/gem/i915_gem_context.h   |  15 +++
>  .../gpu/drm/i915/gem/i915_gem_context_types.h |   1 +
>  .../gpu/drm/i915/gt/intel_engine_heartbeat.c  |  54 ++++++++
>  .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  14 ++
>  drivers/gpu/drm/i915/gt/intel_engine_pm.c     |   2 +-
>  drivers/gpu/drm/i915/i915_priolist_types.h    |   1 +
>  include/uapi/drm/i915_drm.h                   |  15 +++
>  9 files changed, 225 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
>  create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 658b930d34a8..eaa74e000985 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -76,8 +76,9 @@ gt-y += \
>  	gt/intel_breadcrumbs.o \
>  	gt/intel_context.o \
>  	gt/intel_engine_cs.o \
> -	gt/intel_engine_pool.o \
> +	gt/intel_engine_heartbeat.o \
>  	gt/intel_engine_pm.o \
> +	gt/intel_engine_pool.o \
>  	gt/intel_engine_user.o \
>  	gt/intel_gt.o \
>  	gt/intel_gt_irq.o \
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index bd9397669332..5520a896e701 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -70,6 +70,7 @@
>  #include <drm/i915_drm.h>
>
>  #include "gt/intel_lrc_reg.h"
> +#include "gt/intel_engine_heartbeat.h"
>  #include "gt/intel_engine_user.h"
>
>  #include "i915_gem_context.h"
> @@ -375,6 +376,78 @@ void i915_gem_context_release(struct kref *ref)
>  	queue_work(i915->wq, &i915->contexts.free_work);
>  }
>
> +static inline struct i915_gem_engines *
> +__context_engines_static(struct i915_gem_context *ctx)
> +{
> +	return rcu_dereference_protected(ctx->engines, true);
> +}
> +
> +static void kill_context(struct i915_gem_context *ctx)
> +{
> +	intel_engine_mask_t tmp, active, reset;
> +	struct intel_gt *gt = &ctx->i915->gt;
> +	struct i915_gem_engines_iter it;
> +	struct intel_engine_cs *engine;
> +	struct intel_context *ce;
> +
> +	/*
> +	 * If we are already banned, it was due to a guilty request causing
> +	 * a reset and the entire context being evicted from the GPU.
> +	 */
> +	if (i915_gem_context_is_banned(ctx))
> +		return;
> +
> +	i915_gem_context_set_banned(ctx);
> +
> +	/*
> +	 * Map the user's engine back to the actual engines; one virtual
> +	 * engine will be mapped to multiple engines, and using ctx->engine[]
> +	 * the same engine may be have multiple instances in the user's map.
> +	 * However, we only care about pending requests, so only include
> +	 * engines on which there are incomplete requests.
> +	 */
> +	active = 0;
> +	for_each_gem_engine(ce, __context_engines_static(ctx), it) {
> +		struct dma_fence *fence;
> +
> +		if (!ce->timeline)
> +			continue;
> +
> +		fence = i915_active_fence_get(&ce->timeline->last_request);
> +		if (!fence)
> +			continue;
> +
> +		engine = to_request(fence)->engine;
> +		if (HAS_EXECLISTS(gt->i915))
> +			engine = intel_context_inflight(ce);
> +		if (engine)
> +			active |= engine->mask;
> +
> +		dma_fence_put(fence);
> +	}
> +
> +	/*
> +	 * Send a "high priority pulse" down the engine to cause the
> +	 * current request to be momentarily preempted. (If it fails to
> +	 * be preempted, it will be reset). As we have marked our context
> +	 * as banned, any incomplete request, including any running, will
> +	 * be skipped following the preemption.
> +	 */
> +	reset = 0;
> +	for_each_engine_masked(engine, gt->i915, active, tmp)
> +		if (intel_engine_pulse(engine))
> +			reset |= engine->mask;
> +
> +	/*
> +	 * If we are unable to send a preemptive pulse to bump
> +	 * the context from the GPU, we have to resort to a full
> +	 * reset. We hope the collateral damage is worth it.
> +	 */
> +	if (reset)
> +		intel_gt_handle_error(gt, reset, 0,
> +				      "context closure in %s", ctx->name);

This seems inconsistent with the policy of not allowing non-persistence
without pre-emption, since if we can't pre-empt we nuke anyway. But this
feels unsafe to me - how does intel_gt_handle_error() prevent us from
nuking a following context, instead of the target? Ideally we would:

1) Unqueue any context currently queued behind the target context
2) Reset the engine only if the target context is running (it could
   complete during 1)
3) Requeue the other contexts

If the above is viable (?) we don't even need to attempt pre-emption.

> +}
> +
>  static void context_close(struct i915_gem_context *ctx)
>  {
>  	i915_gem_context_set_closed(ctx);
> @@ -400,6 +473,10 @@ static void context_close(struct i915_gem_context *ctx)
>  	lut_close(ctx);
>
>  	mutex_unlock(&ctx->mutex);
> +
> +	if (!i915_gem_context_is_persistent(ctx))
> +		kill_context(ctx);
> +
>  	i915_gem_context_put(ctx);
>  }
>
> @@ -440,6 +517,21 @@ __create_context(struct drm_i915_private *i915)
>  	i915_gem_context_set_bannable(ctx);
>  	i915_gem_context_set_recoverable(ctx);
>
> +	/*
> +	 * If the user has disabled hangchecking, we can not be sure that
> +	 * the batches will ever complete after the context is closed,
> +	 * keep the context and all resources pinned forever. So in this
> +	 * case we opt to forcibly kill off all remaining requests on
> +	 * context close.
> +	 *
> +	 * Note that the user may chance the value of the modparam between

s/chance/change/

> +	 * context creation and close, we choose to ignore this for the
> +	 * sake of determinism and expect the user to set the parameter
> +	 * on module load and never touch it again.
> +	 */
> +	if (i915_modparams.enable_hangcheck) /* cgroup hook? */
> +		i915_gem_context_set_persistence(ctx);
> +
>  	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
>  		ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
>
> @@ -598,6 +690,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
>  	}
>
>  	i915_gem_context_clear_bannable(ctx);
> +	i915_gem_context_set_persistence(ctx);
>  	ctx->sched.priority = I915_USER_PRIORITY(prio);
>
>  	GEM_BUG_ON(!i915_gem_context_is_kernel(ctx));
> @@ -1730,6 +1823,26 @@ get_engines(struct i915_gem_context *ctx,
>  	return err;
>  }
>
> +static int
> +set_persistence(struct i915_gem_context *ctx,
> +		const struct drm_i915_gem_context_param *args)
> +{
> +	if (args->size)
> +		return -EINVAL;
> +
> +	if (args->value) {
> +		i915_gem_context_set_persistence(ctx);
> +		return 0;
> +	}
> +
> +	/* To cancel a context we use "preempt-to-idle" */
> +	if (!(ctx->i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION))
> +		return -ENODEV;

Why do we need to give up on older devices? If you fail to preempt you
reset the context anyway, so can't we just use the reset fallback path?

> +
> +	i915_gem_context_clear_persistence(ctx);
> +	return 0;
> +}
> +
>  static int ctx_setparam(struct drm_i915_file_private *fpriv,
>  			struct i915_gem_context *ctx,
>  			struct drm_i915_gem_context_param *args)
> @@ -1807,6 +1920,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
>  		ret = set_engines(ctx, args);
>  		break;
>
> +	case I915_CONTEXT_PARAM_PERSISTENCE:
> +		ret = set_persistence(ctx, args);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  	default:
>  		ret = -EINVAL;
> @@ -2258,6 +2375,11 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  		ret = get_engines(ctx, args);
>  		break;
>
> +	case I915_CONTEXT_PARAM_PERSISTENCE:
> +		args->size = 0;
> +		args->value = i915_gem_context_is_persistent(ctx);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  	default:
>  		ret = -EINVAL;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> index 176978608b6f..e0f5b6c6a331 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> @@ -74,6 +74,21 @@ static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *c
>  	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
>  }
>
> +static inline bool i915_gem_context_is_persistent(const struct i915_gem_context *ctx)
> +{
> +	return test_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_persistence(struct i915_gem_context *ctx)
> +{
> +	set_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_persistence(struct i915_gem_context *ctx)
> +{
> +	clear_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
>  {
>  	return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index 260d59cc3de8..daf1ea5075a6 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -137,6 +137,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ERROR_CAPTURE	1
>  #define UCONTEXT_BANNABLE		2
>  #define UCONTEXT_RECOVERABLE		3
> +#define UCONTEXT_PERSISTENCE		4
>
>  	/**
>  	 * @flags: small set of booleans
>  	 */
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> new file mode 100644
> index 000000000000..43d1370eaa7f
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> @@ -0,0 +1,54 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "i915_request.h"
> +
> +#include "intel_context.h"
> +#include "intel_engine_heartbeat.h"
> +#include "intel_engine_pm.h"
> +#include "intel_engine.h"
> +#include "intel_gt.h"
> +
> +static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
> +{
> +	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
> +	i915_request_add_active_barriers(rq);
> +}
> +
> +int intel_engine_pulse(struct intel_engine_cs *engine)
> +{
> +	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
> +	struct intel_context *ce = engine->kernel_context;
> +	struct i915_request *rq;
> +	int err = 0;
> +
> +	if (!intel_engine_has_preemption(engine))
> +		return -ENODEV;
> +
> +	if (!intel_engine_pm_get_if_awake(engine))
> +		return 0;
> +
> +	mutex_lock(&ce->timeline->mutex);
> +
> +	intel_context_enter(ce);
> +	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
> +	intel_context_exit(ce);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto out_unlock;
> +	}
> +
> +	rq->flags |= I915_REQUEST_SENTINEL;
> +	idle_pulse(engine, rq);
> +
> +	__i915_request_commit(rq);
> +	__i915_request_queue(rq, &attr);
> +
> +out_unlock:
> +	mutex_unlock(&ce->timeline->mutex);
> +	intel_engine_pm_put(engine);
> +	return err;
> +}
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> new file mode 100644
> index 000000000000..b950451b5998
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> @@ -0,0 +1,14 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef INTEL_ENGINE_HEARTBEAT_H
> +#define INTEL_ENGINE_HEARTBEAT_H
> +
> +struct intel_engine_cs;
> +
> +int intel_engine_pulse(struct intel_engine_cs *engine);
> +
> +#endif /* INTEL_ENGINE_HEARTBEAT_H */
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index 472b2259f629..12a2608a8889 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -110,7 +110,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
>  	i915_request_add_active_barriers(rq);
>
>  	/* Install ourselves as a preemption barrier */
> -	rq->sched.attr.priority = I915_PRIORITY_UNPREEMPTABLE;
> +	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
>  	__i915_request_commit(rq);
>
>  	/* Release our exclusive hold on the engine */
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index 21037a2e2038..ae8bb3cb627e 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -39,6 +39,7 @@ enum {
>   * active request.
>   */
>  #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
> +#define I915_PRIORITY_BARRIER INT_MAX
>
>  #define __NO_PREEMPTION (I915_PRIORITY_WAIT)
>
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 469dc512cca3..dbc8691d75d0 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1565,6 +1565,21 @@ struct drm_i915_gem_context_param {
>  	 * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
>  	 */
>  #define I915_CONTEXT_PARAM_ENGINES	0xa
> +
> +/*
> + * I915_CONTEXT_PARAM_PERSISTENCE:
> + *
> + * Allow the context and active rendering to survive the process until
> + * completion. Persistence allows fire-and-forget clients to queue up a
> + * bunch of work, hand the output over to a display server and the quit.
> + * If the context is not marked as persistent, upon closing (either via
> + * an explicit DRM_I915_GEM_CONTEXT_DESTROY or implicitly from file closure
> + * or process termination), the context and any outstanding requests will be
> + * cancelled (and exported fences for cancelled requests marked as -EIO).
> + *
> + * By default, new contexts allow persistence.
> + */
> +#define I915_CONTEXT_PARAM_PERSISTENCE	0xb
>  /* Must be kept compact -- no holes and well documented */
>
>  	__u64 value;
> --
> 2.23.0
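One last note on the uapi: since the default is inherited from
i915.enable_hangcheck at context-create time, userspace can confirm what it
actually got with a getparam. Rough, untested sketch, same assumptions as
the earlier snippet (helper name is mine):

    /* Returns 1 if ctx_id is persistent, 0 if not, -errno on error. */
    static int ctx_is_persistent(int fd, __u32 ctx_id)
    {
            struct drm_i915_gem_context_param param;

            memset(&param, 0, sizeof(param));
            param.ctx_id = ctx_id;
            param.param = I915_CONTEXT_PARAM_PERSISTENCE;
            if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &param))
                    return -errno;

            return param.value != 0; /* 1 on a default (persistent) context */
    }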