To flush idle barriers, and even inflight requests, we want to send a preemptive 'pulse' along an engine. We use a no-op request along the pinned kernel_context at high priority so that it should run or else kick off the stuck requests. We can use this to ensure idle barriers are immediately flushed, as part of a context cancellation mechanism, or as part of a heartbeat mechanism to detect and reset a stuck GPU. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> --- drivers/gpu/drm/i915/Makefile | 3 +- .../gpu/drm/i915/gt/intel_engine_heartbeat.c | 77 +++++++++ .../gpu/drm/i915/gt/intel_engine_heartbeat.h | 15 ++ drivers/gpu/drm/i915/gt/intel_engine_pm.c | 2 +- .../drm/i915/gt/selftest_engine_heartbeat.c | 159 ++++++++++++++++++ drivers/gpu/drm/i915/i915_active.c | 1 + drivers/gpu/drm/i915/i915_priolist_types.h | 1 + .../drm/i915/selftests/i915_live_selftests.h | 1 + 8 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h create mode 100644 drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index a16a2daef977..2fd4bed188e5 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -78,8 +78,9 @@ gt-y += \ gt/intel_breadcrumbs.o \ gt/intel_context.o \ gt/intel_engine_cs.o \ - gt/intel_engine_pool.o \ + gt/intel_engine_heartbeat.o \ gt/intel_engine_pm.o \ + gt/intel_engine_pool.o \ gt/intel_engine_user.o \ gt/intel_gt.o \ gt/intel_gt_irq.o \ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c new file mode 100644 index 000000000000..4b9ab7813d54 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -0,0 +1,77 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "i915_request.h" + +#include "intel_context.h" +#include "intel_engine_heartbeat.h" +#include "intel_engine_pm.h" +#include "intel_engine.h" +#include "intel_gt.h" + +static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq) +{ + engine->wakeref_serial = READ_ONCE(engine->serial) + 1; + i915_request_add_active_barriers(rq); +} + +int intel_engine_pulse(struct intel_engine_cs *engine) +{ + struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; + struct intel_context *ce = engine->kernel_context; + struct i915_request *rq; + int err = 0; + + if (!intel_engine_has_preemption(engine)) + return -ENODEV; + + if (!intel_engine_pm_get_if_awake(engine)) + return 0; + + if (mutex_lock_interruptible(&ce->timeline->mutex)) + goto out_rpm; + + intel_context_enter(ce); + rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); + intel_context_exit(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_unlock; + } + + rq->flags |= I915_REQUEST_SENTINEL; + idle_pulse(engine, rq); + + __i915_request_commit(rq); + __i915_request_queue(rq, &attr); + +out_unlock: + mutex_unlock(&ce->timeline->mutex); +out_rpm: + intel_engine_pm_put(engine); + return err; +} + +int intel_engine_flush_barriers(struct intel_engine_cs *engine) +{ + struct i915_request *rq; + + if (llist_empty(&engine->barrier_tasks)) + return 0; + + rq = i915_request_create(engine->kernel_context); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + idle_pulse(engine, rq); + i915_request_add(rq); + + return 0; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_engine_heartbeat.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h new file mode 100644 index 000000000000..b334e5aaf78d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h @@ -0,0 +1,15 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_ENGINE_HEARTBEAT_H +#define INTEL_ENGINE_HEARTBEAT_H + +struct intel_engine_cs; + +int intel_engine_pulse(struct intel_engine_cs *engine); +int intel_engine_flush_barriers(struct intel_engine_cs *engine); + +#endif /* INTEL_ENGINE_HEARTBEAT_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index 67eb6183648a..7d76611d9df1 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -111,7 +111,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) i915_request_add_active_barriers(rq); /* Install ourselves as a preemption barrier */ - rq->sched.attr.priority = I915_PRIORITY_UNPREEMPTABLE; + rq->sched.attr.priority = I915_PRIORITY_BARRIER; __i915_request_commit(rq); /* Release our exclusive hold on the engine */ diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c new file mode 100644 index 000000000000..1f5ab59ad6e7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c @@ -0,0 +1,159 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2018 Intel Corporation + */ + +#include "i915_drv.h" + +#include "intel_gt_requests.h" +#include "i915_selftest.h" + +struct pulse { + struct i915_active active; + struct kref kref; +}; + +static int pulse_active(struct i915_active *active) +{ + kref_get(&container_of(active, struct pulse, active)->kref); + return 0; +} + +static void pulse_free(struct kref *kref) +{ + kfree(container_of(kref, struct pulse, kref)); +} + +static void pulse_put(struct pulse *p) +{ + kref_put(&p->kref, pulse_free); +} + +static void pulse_retire(struct i915_active *active) +{ + pulse_put(container_of(active, struct pulse, active)); +} + +static struct pulse *pulse_create(void) +{ + struct pulse *p; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return p; + + kref_init(&p->kref); + i915_active_init(&p->active, pulse_active, pulse_retire); + + return p; +} + +static int __live_idle_pulse(struct intel_engine_cs *engine, + int (*fn)(struct intel_engine_cs *cs)) +{ + struct pulse *p; + int err; + + p = pulse_create(); + if (!p) + return -ENOMEM; + + err = i915_active_acquire(&p->active); + if (err) + goto out; + + err = i915_active_acquire_preallocate_barrier(&p->active, engine); + if (err) { + i915_active_release(&p->active); + goto out; + } + + i915_active_acquire_barrier(&p->active); + i915_active_release(&p->active); + + GEM_BUG_ON(i915_active_is_idle(&p->active)); + + err = fn(engine); + if (err) + goto out; + + if (intel_gt_retire_requests_timeout(engine->gt, HZ / 5)) { + err = -ETIME; + goto out; + } + + if (!i915_active_is_idle(&p->active)) { + pr_err("%s: heartbeat pulse did not flush idle tasks\n", + engine->name); + err = -EINVAL; + goto out; + } + +out: + pulse_put(p); + return err; +} + +static int live_idle_flush(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* Check that we can flush the idle barriers */ + + for_each_engine(engine, gt, id) { + intel_engine_pm_get(engine); + err = __live_idle_pulse(engine, intel_engine_flush_barriers); + intel_engine_pm_put(engine); + if (err) + break; + } + + return err; +} + +static int live_idle_pulse(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + /* Check that heartbeat pulses flush the idle barriers */ + + for_each_engine(engine, gt, id) { + intel_engine_pm_get(engine); + err = __live_idle_pulse(engine, intel_engine_pulse); + intel_engine_pm_put(engine); + if (err && err != -ENODEV) + break; + + err = 0; + } + + return err; +} + +int intel_heartbeat_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_idle_flush), + SUBTEST(live_idle_pulse), + }; + int saved_hangcheck; + int err; + + if (intel_gt_is_wedged(&i915->gt)) + return 0; + + saved_hangcheck = i915_modparams.enable_hangcheck; + i915_modparams.enable_hangcheck = INT_MAX; + + err = intel_gt_live_subtests(tests, &i915->gt); + + i915_modparams.enable_hangcheck = saved_hangcheck; + return err; +} diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index 7927b1a0c7a6..07d39f22a2c3 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -595,6 +595,7 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref, struct llist_node *pos, *next; int err; + GEM_BUG_ON(i915_active_is_idle(ref)); GEM_BUG_ON(!llist_empty(&ref->preallocated_barriers)); /* diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h index 21037a2e2038..ae8bb3cb627e 100644 --- a/drivers/gpu/drm/i915/i915_priolist_types.h +++ b/drivers/gpu/drm/i915/i915_priolist_types.h @@ -39,6 +39,7 @@ enum { * active request. */ #define I915_PRIORITY_UNPREEMPTABLE INT_MAX +#define I915_PRIORITY_BARRIER INT_MAX #define __NO_PREEMPTION (I915_PRIORITY_WAIT) diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h index 6daf6599ec79..00a063730bc3 100644 --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h @@ -17,6 +17,7 @@ selftest(gt_timelines, intel_timeline_live_selftests) selftest(gt_contexts, intel_context_live_selftests) selftest(gt_lrc, intel_lrc_live_selftests) selftest(gt_pm, intel_gt_pm_live_selftests) +selftest(gt_heartbeat, intel_heartbeat_live_selftests) selftest(requests, i915_request_live_selftests) selftest(active, i915_active_live_selftests) selftest(objects, i915_gem_object_live_selftests) -- 2.24.0.rc0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx