> -----Original Message----- > From: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Sent: Thursday, July 25, 2019 4:17 PM > To: intel-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>; Joonas Lahtinen > <joonas.lahtinen@xxxxxxxxxxxxxxx>; Ursulin, Tvrtko <tvrtko.ursulin@xxxxxxxxx>; > Bloomfield, Jon <jon.bloomfield@xxxxxxxxx> > Subject: [PATCH] drm/i915: Replace hangcheck by heartbeats > > Replace sampling the engine state every so often with a periodic > heartbeat request to measure the health of an engine. This is coupled > with the forced-preemption to allow long running requests to survive so > long as they do not block other users. Can you explain why we would need this at all if we have forced-preemption? Forced preemption guarantees that an engine cannot interfere with the timely execution of other contexts. If it hangs, but nothing else wants to use the engine then do we care? Power, obviously. But I'm not everything can be pre-empted. If a compute context is running on an engine, and no other contexts require that engine, then is it right to nuke it just because it's busy in a long running thread? > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > Cc: Jon Bloomfield <jon.bloomfield@xxxxxxxxx> > --- > Note, strictly this still requires struct_mutex-free retirement for the > corner case where the idle-worker is ineffective and we get a backlog of > requests on the kernel ring. Or if the legacy ringbuffer is full... > When we are able to retire from outside of struct_mutex we can do the > full idle-barrier and idle-work from here. > --- > drivers/gpu/drm/i915/Kconfig.profile | 11 + > drivers/gpu/drm/i915/Makefile | 2 +- > drivers/gpu/drm/i915/gem/i915_gem_pm.c | 2 - > drivers/gpu/drm/i915/gt/intel_engine.h | 32 -- > drivers/gpu/drm/i915/gt/intel_engine_cs.c | 10 +- > .../gpu/drm/i915/gt/intel_engine_heartbeat.c | 99 +++++ > .../gpu/drm/i915/gt/intel_engine_heartbeat.h | 17 + > drivers/gpu/drm/i915/gt/intel_engine_pm.c | 5 +- > drivers/gpu/drm/i915/gt/intel_engine_types.h | 14 +- > drivers/gpu/drm/i915/gt/intel_gt.c | 1 - > drivers/gpu/drm/i915/gt/intel_gt.h | 4 - > drivers/gpu/drm/i915/gt/intel_gt_pm.c | 2 - > drivers/gpu/drm/i915/gt/intel_gt_types.h | 9 - > drivers/gpu/drm/i915/gt/intel_hangcheck.c | 360 ------------------ > drivers/gpu/drm/i915/gt/intel_reset.c | 3 +- > drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 4 - > drivers/gpu/drm/i915/i915_debugfs.c | 86 ----- > drivers/gpu/drm/i915/i915_drv.c | 6 +- > drivers/gpu/drm/i915/i915_drv.h | 1 - > drivers/gpu/drm/i915/i915_gpu_error.c | 37 +- > drivers/gpu/drm/i915/i915_gpu_error.h | 2 - > drivers/gpu/drm/i915/i915_params.c | 5 - > drivers/gpu/drm/i915/i915_params.h | 1 - > 23 files changed, 151 insertions(+), 562 deletions(-) > create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c > create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h > delete mode 100644 drivers/gpu/drm/i915/gt/intel_hangcheck.c > > diff --git a/drivers/gpu/drm/i915/Kconfig.profile > b/drivers/gpu/drm/i915/Kconfig.profile > index 3184e8491333..aafb57f84169 100644 > --- a/drivers/gpu/drm/i915/Kconfig.profile > +++ b/drivers/gpu/drm/i915/Kconfig.profile > @@ -37,3 +37,14 @@ config DRM_I915_PREEMPT_TIMEOUT > to execute. > > May be 0 to disable the timeout. > + > +config DRM_I915_HEARTBEAT_INTERVAL > + int "Interval between heartbeat pulses (ms)" > + default 2500 # microseconds > + help > + While active the driver uses a periodic request, a heartbeat, to > + check the wellness of the GPU and to regularly flush state changes > + (idle barriers). > + > + May be 0 to disable heartbeats and therefore disable automatic GPU > + hang detection. > diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile > index 524516251a40..18201852d68d 100644 > --- a/drivers/gpu/drm/i915/Makefile > +++ b/drivers/gpu/drm/i915/Makefile > @@ -73,10 +73,10 @@ gt-y += \ > gt/intel_breadcrumbs.o \ > gt/intel_context.o \ > gt/intel_engine_cs.o \ > + gt/intel_engine_heartbeat.o \ > gt/intel_engine_pm.o \ > gt/intel_gt.o \ > gt/intel_gt_pm.o \ > - gt/intel_hangcheck.o \ > gt/intel_lrc.o \ > gt/intel_renderstate.o \ > gt/intel_reset.o \ > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pm.c > b/drivers/gpu/drm/i915/gem/i915_gem_pm.c > index b5561cbdc5ea..a5a0aefcc04c 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_pm.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_pm.c > @@ -170,8 +170,6 @@ void i915_gem_suspend(struct drm_i915_private > *i915) > GEM_BUG_ON(i915->gt.awake); > flush_work(&i915->gem.idle_work); > > - cancel_delayed_work_sync(&i915->gt.hangcheck.work); > - > i915_gem_drain_freed_objects(i915); > > intel_uc_suspend(&i915->gt.uc); > diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h > b/drivers/gpu/drm/i915/gt/intel_engine.h > index db5c73ce86ee..38eeb4320c97 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine.h > @@ -90,38 +90,6 @@ struct drm_printer; > /* seqno size is actually only a uint32, but since we plan to use MI_FLUSH_DW > to > * do the writes, and that must have qw aligned offsets, simply pretend it's 8b. > */ > -enum intel_engine_hangcheck_action { > - ENGINE_IDLE = 0, > - ENGINE_WAIT, > - ENGINE_ACTIVE_SEQNO, > - ENGINE_ACTIVE_HEAD, > - ENGINE_ACTIVE_SUBUNITS, > - ENGINE_WAIT_KICK, > - ENGINE_DEAD, > -}; > - > -static inline const char * > -hangcheck_action_to_str(const enum intel_engine_hangcheck_action a) > -{ > - switch (a) { > - case ENGINE_IDLE: > - return "idle"; > - case ENGINE_WAIT: > - return "wait"; > - case ENGINE_ACTIVE_SEQNO: > - return "active seqno"; > - case ENGINE_ACTIVE_HEAD: > - return "active head"; > - case ENGINE_ACTIVE_SUBUNITS: > - return "active subunits"; > - case ENGINE_WAIT_KICK: > - return "wait kick"; > - case ENGINE_DEAD: > - return "dead"; > - } > - > - return "unknown"; > -} > > void intel_engines_set_scheduler_caps(struct drm_i915_private *i915); > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c > b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > index 65cbf1d9118d..89f4a3825b77 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > @@ -621,7 +621,6 @@ static int intel_engine_setup_common(struct > intel_engine_cs *engine) > intel_engine_init_active(engine, ENGINE_PHYSICAL); > intel_engine_init_breadcrumbs(engine); > intel_engine_init_execlists(engine); > - intel_engine_init_hangcheck(engine); > intel_engine_init_batch_pool(engine); > intel_engine_init_cmd_parser(engine); > intel_engine_init__pm(engine); > @@ -1453,8 +1452,13 @@ void intel_engine_dump(struct intel_engine_cs > *engine, > drm_printf(m, "*** WEDGED ***\n"); > > drm_printf(m, "\tAwake? %d\n", atomic_read(&engine- > >wakeref.count)); > - drm_printf(m, "\tHangcheck: %d ms ago\n", > - jiffies_to_msecs(jiffies - engine- > >hangcheck.action_timestamp)); > + > + rcu_read_lock(); > + rq = READ_ONCE(engine->last_heartbeat); > + if (rq) > + drm_printf(m, "\tHeartbeat: %d ms ago\n", > + jiffies_to_msecs(jiffies - rq->emitted_jiffies)); > + rcu_read_unlock(); > drm_printf(m, "\tReset count: %d (global %d)\n", > i915_reset_engine_count(error, engine), > i915_reset_count(error)); > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c > b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c > new file mode 100644 > index 000000000000..42330b1074e6 > --- /dev/null > +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c > @@ -0,0 +1,99 @@ > +/* > + * SPDX-License-Identifier: MIT > + * > + * Copyright © 2019 Intel Corporation > + */ > + > +#include "i915_request.h" > + > +#include "intel_context.h" > +#include "intel_engine_heartbeat.h" > +#include "intel_engine_pm.h" > +#include "intel_gt.h" > +#include "intel_reset.h" > + > +static long delay(void) > +{ > + const long t = > msecs_to_jiffies(CONFIG_DRM_I915_HEARTBEAT_INTERVAL); > + > + return round_jiffies_up_relative(t); > +} > + > +static void heartbeat(struct work_struct *wrk) > +{ > + struct intel_engine_cs *engine = > + container_of(wrk, typeof(*engine), heartbeat.work); > + struct intel_context *ce = engine->kernel_context; > + struct i915_request *rq; > + > + if (!intel_engine_pm_get_if_awake(engine)) > + return; > + > + rq = engine->last_heartbeat; > + if (rq && i915_request_completed(rq)) { > + i915_request_put(rq); > + engine->last_heartbeat = NULL; > + } > + > + if (intel_gt_is_wedged(engine->gt)) > + goto out; > + > + if (engine->last_heartbeat) { > + if (engine->schedule && rq->sched.attr.priority < INT_MAX) { > + struct i915_sched_attr attr = { .priority = INT_MAX }; > + > + local_bh_disable(); > + engine->schedule(rq, &attr); > + local_bh_enable(); > + } else { > + intel_gt_handle_error(engine->gt, engine->mask, > + I915_ERROR_CAPTURE, > + "stopped heartbeat on %s", > + engine->name); > + } > + goto out; > + } > + > + if (engine->wakeref_serial == engine->serial) > + goto out; > + > + if (intel_context_timeline_lock(ce)) > + goto out; > + > + intel_context_enter(ce); > + rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); > + intel_context_exit(ce); > + if (IS_ERR(rq)) > + goto unlock; > + > + engine->last_heartbeat = i915_request_get(rq); > + engine->wakeref_serial = engine->serial + 1; > + > + i915_request_add_barriers(rq); > + __i915_request_commit(rq); > + > +unlock: > + intel_context_timeline_unlock(ce); > +out: > + schedule_delayed_work(&engine->heartbeat, delay()); > + intel_engine_pm_put(engine); > +} > + > +void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine) > +{ > + if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL) > + return; > + > + schedule_delayed_work(&engine->heartbeat, delay()); > +} > + > +void intel_engine_park_heartbeat(struct intel_engine_cs *engine) > +{ > + cancel_delayed_work_sync(&engine->heartbeat); > + i915_request_put(fetch_and_zero(&engine->last_heartbeat)); > +} > + > +void intel_engine_init_heartbeat(struct intel_engine_cs *engine) > +{ > + INIT_DELAYED_WORK(&engine->heartbeat, heartbeat); > +} > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h > b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h > new file mode 100644 > index 000000000000..e04f9c7aa85d > --- /dev/null > +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h > @@ -0,0 +1,17 @@ > +/* > + * SPDX-License-Identifier: MIT > + * > + * Copyright © 2019 Intel Corporation > + */ > + > +#ifndef INTEL_ENGINE_HEARTBEAT_H > +#define INTEL_ENGINE_HEARTBEAT_H > + > +struct intel_engine_cs; > + > +void intel_engine_init_heartbeat(struct intel_engine_cs *engine); > + > +void intel_engine_park_heartbeat(struct intel_engine_cs *engine); > +void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine); > + > +#endif /* INTEL_ENGINE_HEARTBEAT_H */ > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c > b/drivers/gpu/drm/i915/gt/intel_engine_pm.c > index e74fbf04a68d..6349bb038c72 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c > @@ -7,6 +7,7 @@ > #include "i915_drv.h" > > #include "intel_engine.h" > +#include "intel_engine_heartbeat.h" > #include "intel_engine_pm.h" > #include "intel_gt.h" > #include "intel_gt_pm.h" > @@ -32,7 +33,7 @@ static int __engine_unpark(struct intel_wakeref *wf) > if (engine->unpark) > engine->unpark(engine); > > - intel_engine_init_hangcheck(engine); > + intel_engine_unpark_heartbeat(engine); > return 0; > } > > @@ -115,6 +116,7 @@ static int __engine_park(struct intel_wakeref *wf) > > GEM_TRACE("%s\n", engine->name); > > + intel_engine_park_heartbeat(engine); > intel_engine_disarm_breadcrumbs(engine); > > /* Must be reset upon idling, or we may miss the busy wakeup. */ > @@ -142,4 +144,5 @@ void intel_engine_pm_put(struct intel_engine_cs > *engine) > void intel_engine_init__pm(struct intel_engine_cs *engine) > { > intel_wakeref_init(&engine->wakeref); > + intel_engine_init_heartbeat(engine); > } > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h > b/drivers/gpu/drm/i915/gt/intel_engine_types.h > index 8be63019d707..9a3ead556743 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h > @@ -14,6 +14,7 @@ > #include <linux/llist.h> > #include <linux/timer.h> > #include <linux/types.h> > +#include <linux/workqueue.h> > > #include "i915_gem.h" > #include "i915_gem_batch_pool.h" > @@ -55,14 +56,6 @@ struct intel_instdone { > u32 row[I915_MAX_SLICES][I915_MAX_SUBSLICES]; > }; > > -struct intel_engine_hangcheck { > - u64 acthd; > - u32 last_ring; > - u32 last_head; > - unsigned long action_timestamp; > - struct intel_instdone instdone; > -}; > - > struct intel_ring { > struct kref ref; > struct i915_vma *vma; > @@ -298,6 +291,9 @@ struct intel_engine_cs { > struct drm_i915_gem_object *default_state; > void *pinned_default_state; > > + struct delayed_work heartbeat; > + struct i915_request *last_heartbeat; > + > /* Rather than have every client wait upon all user interrupts, > * with the herd waking after every interrupt and each doing the > * heavyweight seqno dance, we delegate the task (of being the > @@ -437,8 +433,6 @@ struct intel_engine_cs { > /* status_notifier: list of callbacks for context-switch changes */ > struct atomic_notifier_head context_status_notifier; > > - struct intel_engine_hangcheck hangcheck; > - > #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) > #define I915_ENGINE_SUPPORTS_STATS BIT(1) > #define I915_ENGINE_HAS_PREEMPTION BIT(2) > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c > b/drivers/gpu/drm/i915/gt/intel_gt.c > index f7e69db4019d..feaf916c9abd 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.c > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c > @@ -19,7 +19,6 @@ void intel_gt_init_early(struct intel_gt *gt, struct > drm_i915_private *i915) > > spin_lock_init(>->closed_lock); > > - intel_gt_init_hangcheck(gt); > intel_gt_init_reset(gt); > intel_gt_pm_init_early(gt); > } > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h > b/drivers/gpu/drm/i915/gt/intel_gt.h > index 640bb0531f5b..54d22913ede3 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.h > +++ b/drivers/gpu/drm/i915/gt/intel_gt.h > @@ -39,8 +39,6 @@ void intel_gt_clear_error_registers(struct intel_gt *gt, > void intel_gt_flush_ggtt_writes(struct intel_gt *gt); > void intel_gt_chipset_flush(struct intel_gt *gt); > > -void intel_gt_init_hangcheck(struct intel_gt *gt); > - > int intel_gt_init_scratch(struct intel_gt *gt, unsigned int size); > void intel_gt_fini_scratch(struct intel_gt *gt); > > @@ -55,6 +53,4 @@ static inline bool intel_gt_is_wedged(struct intel_gt *gt) > return __intel_reset_failed(>->reset); > } > > -void intel_gt_queue_hangcheck(struct intel_gt *gt); > - > #endif /* __INTEL_GT_H__ */ > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c > b/drivers/gpu/drm/i915/gt/intel_gt_pm.c > index 65c0d0c9d543..d223b07ee324 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c > +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c > @@ -46,8 +46,6 @@ static int intel_gt_unpark(struct intel_wakeref *wf) > > i915_pmu_gt_unparked(i915); > > - intel_gt_queue_hangcheck(gt); > - > pm_notify(i915, INTEL_GT_UNPARK); > > return 0; > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h > b/drivers/gpu/drm/i915/gt/intel_gt_types.h > index 34d4a868e4f1..0112bb98cbf3 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h > @@ -23,14 +23,6 @@ struct drm_i915_private; > struct i915_ggtt; > struct intel_uncore; > > -struct intel_hangcheck { > - /* For hangcheck timer */ > -#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */ > -#define DRM_I915_HANGCHECK_JIFFIES > msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD) > - > - struct delayed_work work; > -}; > - > struct intel_gt { > struct drm_i915_private *i915; > struct intel_uncore *uncore; > @@ -54,7 +46,6 @@ struct intel_gt { > struct list_head closed_vma; > spinlock_t closed_lock; /* guards the list of closed_vma */ > > - struct intel_hangcheck hangcheck; > struct intel_reset reset; > > /** > diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c > b/drivers/gpu/drm/i915/gt/intel_hangcheck.c > deleted file mode 100644 > index 05d042cdefe2..000000000000 > --- a/drivers/gpu/drm/i915/gt/intel_hangcheck.c > +++ /dev/null > @@ -1,360 +0,0 @@ > -/* > - * Copyright © 2016 Intel Corporation > - * > - * Permission is hereby granted, free of charge, to any person obtaining a > - * copy of this software and associated documentation files (the "Software"), > - * to deal in the Software without restriction, including without limitation > - * the rights to use, copy, modify, merge, publish, distribute, sublicense, > - * and/or sell copies of the Software, and to permit persons to whom the > - * Software is furnished to do so, subject to the following conditions: > - * > - * The above copyright notice and this permission notice (including the next > - * paragraph) shall be included in all copies or substantial portions of the > - * Software. > - * > - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR > - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, > - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO > EVENT SHALL > - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, > DAMAGES OR OTHER > - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > ARISING > - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > OTHER DEALINGS > - * IN THE SOFTWARE. > - * > - */ > - > -#include "i915_drv.h" > -#include "intel_engine.h" > -#include "intel_gt.h" > -#include "intel_reset.h" > - > -struct hangcheck { > - u64 acthd; > - u32 ring; > - u32 head; > - enum intel_engine_hangcheck_action action; > - unsigned long action_timestamp; > - int deadlock; > - struct intel_instdone instdone; > - bool wedged:1; > - bool stalled:1; > -}; > - > -static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone) > -{ > - u32 tmp = current_instdone | *old_instdone; > - bool unchanged; > - > - unchanged = tmp == *old_instdone; > - *old_instdone |= tmp; > - > - return unchanged; > -} > - > -static bool subunits_stuck(struct intel_engine_cs *engine) > -{ > - struct drm_i915_private *dev_priv = engine->i915; > - struct intel_instdone instdone; > - struct intel_instdone *accu_instdone = &engine->hangcheck.instdone; > - bool stuck; > - int slice; > - int subslice; > - > - intel_engine_get_instdone(engine, &instdone); > - > - /* There might be unstable subunit states even when > - * actual head is not moving. Filter out the unstable ones by > - * accumulating the undone -> done transitions and only > - * consider those as progress. > - */ > - stuck = instdone_unchanged(instdone.instdone, > - &accu_instdone->instdone); > - stuck &= instdone_unchanged(instdone.slice_common, > - &accu_instdone->slice_common); > - > - for_each_instdone_slice_subslice(dev_priv, slice, subslice) { > - stuck &= > instdone_unchanged(instdone.sampler[slice][subslice], > - &accu_instdone- > >sampler[slice][subslice]); > - stuck &= instdone_unchanged(instdone.row[slice][subslice], > - &accu_instdone- > >row[slice][subslice]); > - } > - > - return stuck; > -} > - > -static enum intel_engine_hangcheck_action > -head_stuck(struct intel_engine_cs *engine, u64 acthd) > -{ > - if (acthd != engine->hangcheck.acthd) { > - > - /* Clear subunit states on head movement */ > - memset(&engine->hangcheck.instdone, 0, > - sizeof(engine->hangcheck.instdone)); > - > - return ENGINE_ACTIVE_HEAD; > - } > - > - if (!subunits_stuck(engine)) > - return ENGINE_ACTIVE_SUBUNITS; > - > - return ENGINE_DEAD; > -} > - > -static enum intel_engine_hangcheck_action > -engine_stuck(struct intel_engine_cs *engine, u64 acthd) > -{ > - enum intel_engine_hangcheck_action ha; > - u32 tmp; > - > - ha = head_stuck(engine, acthd); > - if (ha != ENGINE_DEAD) > - return ha; > - > - if (IS_GEN(engine->i915, 2)) > - return ENGINE_DEAD; > - > - /* Is the chip hanging on a WAIT_FOR_EVENT? > - * If so we can simply poke the RB_WAIT bit > - * and break the hang. This should work on > - * all but the second generation chipsets. > - */ > - tmp = ENGINE_READ(engine, RING_CTL); > - if (tmp & RING_WAIT) { > - intel_gt_handle_error(engine->gt, engine->mask, 0, > - "stuck wait on %s", engine->name); > - ENGINE_WRITE(engine, RING_CTL, tmp); > - return ENGINE_WAIT_KICK; > - } > - > - return ENGINE_DEAD; > -} > - > -static void hangcheck_load_sample(struct intel_engine_cs *engine, > - struct hangcheck *hc) > -{ > - hc->acthd = intel_engine_get_active_head(engine); > - hc->ring = ENGINE_READ(engine, RING_START); > - hc->head = ENGINE_READ(engine, RING_HEAD); > -} > - > -static void hangcheck_store_sample(struct intel_engine_cs *engine, > - const struct hangcheck *hc) > -{ > - engine->hangcheck.acthd = hc->acthd; > - engine->hangcheck.last_ring = hc->ring; > - engine->hangcheck.last_head = hc->head; > -} > - > -static enum intel_engine_hangcheck_action > -hangcheck_get_action(struct intel_engine_cs *engine, > - const struct hangcheck *hc) > -{ > - if (intel_engine_is_idle(engine)) > - return ENGINE_IDLE; > - > - if (engine->hangcheck.last_ring != hc->ring) > - return ENGINE_ACTIVE_SEQNO; > - > - if (engine->hangcheck.last_head != hc->head) > - return ENGINE_ACTIVE_SEQNO; > - > - return engine_stuck(engine, hc->acthd); > -} > - > -static void hangcheck_accumulate_sample(struct intel_engine_cs *engine, > - struct hangcheck *hc) > -{ > - unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT; > - > - hc->action = hangcheck_get_action(engine, hc); > - > - /* We always increment the progress > - * if the engine is busy and still processing > - * the same request, so that no single request > - * can run indefinitely (such as a chain of > - * batches). The only time we do not increment > - * the hangcheck score on this ring, if this > - * engine is in a legitimate wait for another > - * engine. In that case the waiting engine is a > - * victim and we want to be sure we catch the > - * right culprit. Then every time we do kick > - * the ring, make it as a progress as the seqno > - * advancement might ensure and if not, it > - * will catch the hanging engine. > - */ > - > - switch (hc->action) { > - case ENGINE_IDLE: > - case ENGINE_ACTIVE_SEQNO: > - /* Clear head and subunit states on seqno movement */ > - hc->acthd = 0; > - > - memset(&engine->hangcheck.instdone, 0, > - sizeof(engine->hangcheck.instdone)); > - > - /* Intentional fall through */ > - case ENGINE_WAIT_KICK: > - case ENGINE_WAIT: > - engine->hangcheck.action_timestamp = jiffies; > - break; > - > - case ENGINE_ACTIVE_HEAD: > - case ENGINE_ACTIVE_SUBUNITS: > - /* > - * Seqno stuck with still active engine gets leeway, > - * in hopes that it is just a long shader. > - */ > - timeout = I915_SEQNO_DEAD_TIMEOUT; > - break; > - > - case ENGINE_DEAD: > - break; > - > - default: > - MISSING_CASE(hc->action); > - } > - > - hc->stalled = time_after(jiffies, > - engine->hangcheck.action_timestamp + > timeout); > - hc->wedged = time_after(jiffies, > - engine->hangcheck.action_timestamp + > - I915_ENGINE_WEDGED_TIMEOUT); > -} > - > -static void hangcheck_declare_hang(struct intel_gt *gt, > - intel_engine_mask_t hung, > - intel_engine_mask_t stuck) > -{ > - struct intel_engine_cs *engine; > - intel_engine_mask_t tmp; > - char msg[80]; > - int len; > - > - /* If some rings hung but others were still busy, only > - * blame the hanging rings in the synopsis. > - */ > - if (stuck != hung) > - hung &= ~stuck; > - len = scnprintf(msg, sizeof(msg), > - "%s on ", stuck == hung ? "no progress" : "hang"); > - for_each_engine_masked(engine, gt->i915, hung, tmp) > - len += scnprintf(msg + len, sizeof(msg) - len, > - "%s, ", engine->name); > - msg[len-2] = '\0'; > - > - return intel_gt_handle_error(gt, hung, I915_ERROR_CAPTURE, "%s", > msg); > -} > - > -/* > - * This is called when the chip hasn't reported back with completed > - * batchbuffers in a long time. We keep track per ring seqno progress and > - * if there are no progress, hangcheck score for that ring is increased. > - * Further, acthd is inspected to see if the ring is stuck. On stuck case > - * we kick the ring. If we see no progress on three subsequent calls > - * we assume chip is wedged and try to fix it by resetting the chip. > - */ > -static void hangcheck_elapsed(struct work_struct *work) > -{ > - struct intel_gt *gt = > - container_of(work, typeof(*gt), hangcheck.work.work); > - intel_engine_mask_t hung = 0, stuck = 0, wedged = 0; > - struct intel_engine_cs *engine; > - enum intel_engine_id id; > - intel_wakeref_t wakeref; > - > - if (!i915_modparams.enable_hangcheck) > - return; > - > - if (!READ_ONCE(gt->awake)) > - return; > - > - if (intel_gt_is_wedged(gt)) > - return; > - > - wakeref = intel_runtime_pm_get_if_in_use(>->i915->runtime_pm); > - if (!wakeref) > - return; > - > - /* As enabling the GPU requires fairly extensive mmio access, > - * periodically arm the mmio checker to see if we are triggering > - * any invalid access. > - */ > - intel_uncore_arm_unclaimed_mmio_detection(gt->uncore); > - > - for_each_engine(engine, gt->i915, id) { > - struct hangcheck hc; > - > - intel_engine_signal_breadcrumbs(engine); > - > - hangcheck_load_sample(engine, &hc); > - hangcheck_accumulate_sample(engine, &hc); > - hangcheck_store_sample(engine, &hc); > - > - if (hc.stalled) { > - hung |= engine->mask; > - if (hc.action != ENGINE_DEAD) > - stuck |= engine->mask; > - } > - > - if (hc.wedged) > - wedged |= engine->mask; > - } > - > - if (GEM_SHOW_DEBUG() && (hung | stuck)) { > - struct drm_printer p = drm_debug_printer("hangcheck"); > - > - for_each_engine(engine, gt->i915, id) { > - if (intel_engine_is_idle(engine)) > - continue; > - > - intel_engine_dump(engine, &p, "%s\n", engine- > >name); > - } > - } > - > - if (wedged) { > - dev_err(gt->i915->drm.dev, > - "GPU recovery timed out," > - " cancelling all in-flight rendering.\n"); > - GEM_TRACE_DUMP(); > - intel_gt_set_wedged(gt); > - } > - > - if (hung) > - hangcheck_declare_hang(gt, hung, stuck); > - > - intel_runtime_pm_put(>->i915->runtime_pm, wakeref); > - > - /* Reset timer in case GPU hangs without another request being added > */ > - intel_gt_queue_hangcheck(gt); > -} > - > -void intel_gt_queue_hangcheck(struct intel_gt *gt) > -{ > - unsigned long delay; > - > - if (unlikely(!i915_modparams.enable_hangcheck)) > - return; > - > - /* > - * Don't continually defer the hangcheck so that it is always run at > - * least once after work has been scheduled on any ring. Otherwise, > - * we will ignore a hung ring if a second ring is kept busy. > - */ > - > - delay = round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES); > - queue_delayed_work(system_long_wq, >->hangcheck.work, delay); > -} > - > -void intel_engine_init_hangcheck(struct intel_engine_cs *engine) > -{ > - memset(&engine->hangcheck, 0, sizeof(engine->hangcheck)); > - engine->hangcheck.action_timestamp = jiffies; > -} > - > -void intel_gt_init_hangcheck(struct intel_gt *gt) > -{ > - INIT_DELAYED_WORK(>->hangcheck.work, hangcheck_elapsed); > -} > - > -#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) > -#include "selftest_hangcheck.c" > -#endif > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c > b/drivers/gpu/drm/i915/gt/intel_reset.c > index 98c071fe532b..0cb457538e62 100644 > --- a/drivers/gpu/drm/i915/gt/intel_reset.c > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c > @@ -978,8 +978,6 @@ void intel_gt_reset(struct intel_gt *gt, > if (ret) > goto taint; > > - intel_gt_queue_hangcheck(gt); > - > finish: > reset_finish(gt, awake); > unlock: > @@ -1305,4 +1303,5 @@ void __intel_fini_wedge(struct intel_wedge_me *w) > > #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) > #include "selftest_reset.c" > +#include "selftest_hangcheck.c" > #endif > diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c > b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c > index e2fa38a1ff0f..9d64390966e3 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c > +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c > @@ -1732,7 +1732,6 @@ int intel_hangcheck_live_selftests(struct > drm_i915_private *i915) > }; > struct intel_gt *gt = &i915->gt; > intel_wakeref_t wakeref; > - bool saved_hangcheck; > int err; > > if (!intel_has_gpu_reset(gt->i915)) > @@ -1742,8 +1741,6 @@ int intel_hangcheck_live_selftests(struct > drm_i915_private *i915) > return -EIO; /* we're long past hope of a successful reset */ > > wakeref = intel_runtime_pm_get(>->i915->runtime_pm); > - saved_hangcheck = > fetch_and_zero(&i915_modparams.enable_hangcheck); > - drain_delayed_work(>->hangcheck.work); /* flush param */ > > err = intel_gt_live_subtests(tests, gt); > > @@ -1751,7 +1748,6 @@ int intel_hangcheck_live_selftests(struct > drm_i915_private *i915) > igt_flush_test(gt->i915, I915_WAIT_LOCKED); > mutex_unlock(>->i915->drm.struct_mutex); > > - i915_modparams.enable_hangcheck = saved_hangcheck; > intel_runtime_pm_put(>->i915->runtime_pm, wakeref); > > return err; > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c > b/drivers/gpu/drm/i915/i915_debugfs.c > index 24787bb48c9f..1f0408ff345b 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -1044,91 +1044,6 @@ static int i915_frequency_info(struct seq_file *m, > void *unused) > return ret; > } > > -static void i915_instdone_info(struct drm_i915_private *dev_priv, > - struct seq_file *m, > - struct intel_instdone *instdone) > -{ > - int slice; > - int subslice; > - > - seq_printf(m, "\t\tINSTDONE: 0x%08x\n", > - instdone->instdone); > - > - if (INTEL_GEN(dev_priv) <= 3) > - return; > - > - seq_printf(m, "\t\tSC_INSTDONE: 0x%08x\n", > - instdone->slice_common); > - > - if (INTEL_GEN(dev_priv) <= 6) > - return; > - > - for_each_instdone_slice_subslice(dev_priv, slice, subslice) > - seq_printf(m, "\t\tSAMPLER_INSTDONE[%d][%d]: 0x%08x\n", > - slice, subslice, instdone->sampler[slice][subslice]); > - > - for_each_instdone_slice_subslice(dev_priv, slice, subslice) > - seq_printf(m, "\t\tROW_INSTDONE[%d][%d]: 0x%08x\n", > - slice, subslice, instdone->row[slice][subslice]); > -} > - > -static int i915_hangcheck_info(struct seq_file *m, void *unused) > -{ > - struct drm_i915_private *i915 = node_to_i915(m->private); > - struct intel_gt *gt = &i915->gt; > - struct intel_engine_cs *engine; > - intel_wakeref_t wakeref; > - enum intel_engine_id id; > - > - seq_printf(m, "Reset flags: %lx\n", gt->reset.flags); > - if (test_bit(I915_WEDGED, >->reset.flags)) > - seq_puts(m, "\tWedged\n"); > - if (test_bit(I915_RESET_BACKOFF, >->reset.flags)) > - seq_puts(m, "\tDevice (global) reset in progress\n"); > - > - if (!i915_modparams.enable_hangcheck) { > - seq_puts(m, "Hangcheck disabled\n"); > - return 0; > - } > - > - if (timer_pending(>->hangcheck.work.timer)) > - seq_printf(m, "Hangcheck active, timer fires in %dms\n", > - jiffies_to_msecs(gt->hangcheck.work.timer.expires - > - jiffies)); > - else if (delayed_work_pending(>->hangcheck.work)) > - seq_puts(m, "Hangcheck active, work pending\n"); > - else > - seq_puts(m, "Hangcheck inactive\n"); > - > - seq_printf(m, "GT active? %s\n", yesno(gt->awake)); > - > - with_intel_runtime_pm(&i915->runtime_pm, wakeref) { > - for_each_engine(engine, i915, id) { > - struct intel_instdone instdone; > - > - seq_printf(m, "%s: %d ms ago\n", > - engine->name, > - jiffies_to_msecs(jiffies - > - engine- > >hangcheck.action_timestamp)); > - > - seq_printf(m, "\tACTHD = 0x%08llx [current > 0x%08llx]\n", > - (long long)engine->hangcheck.acthd, > - intel_engine_get_active_head(engine)); > - > - intel_engine_get_instdone(engine, &instdone); > - > - seq_puts(m, "\tinstdone read =\n"); > - i915_instdone_info(i915, m, &instdone); > - > - seq_puts(m, "\tinstdone accu =\n"); > - i915_instdone_info(i915, m, > - &engine->hangcheck.instdone); > - } > - } > - > - return 0; > -} > - > static int ironlake_drpc_info(struct seq_file *m) > { > struct drm_i915_private *i915 = node_to_i915(m->private); > @@ -4387,7 +4302,6 @@ static const struct drm_info_list i915_debugfs_list[] = > { > {"i915_guc_stage_pool", i915_guc_stage_pool, 0}, > {"i915_huc_load_status", i915_huc_load_status_info, 0}, > {"i915_frequency_info", i915_frequency_info, 0}, > - {"i915_hangcheck_info", i915_hangcheck_info, 0}, > {"i915_drpc_info", i915_drpc_info, 0}, > {"i915_emon_status", i915_emon_status, 0}, > {"i915_ring_freq_table", i915_ring_freq_table, 0}, > diff --git a/drivers/gpu/drm/i915/i915_drv.c > b/drivers/gpu/drm/i915/i915_drv.c > index f2d3d754af37..8c277fbf9663 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -411,8 +411,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, > void *data, > return -ENODEV; > break; > case I915_PARAM_HAS_GPU_RESET: > - value = i915_modparams.enable_hangcheck && > - intel_has_gpu_reset(dev_priv); > + value = intel_has_gpu_reset(dev_priv); > if (value && intel_has_reset_engine(dev_priv)) > value = 2; > break; > @@ -1975,10 +1974,7 @@ void i915_driver_remove(struct drm_device *dev) > > intel_csr_ucode_fini(dev_priv); > > - /* Free error state after interrupts are fully disabled. */ > - cancel_delayed_work_sync(&dev_priv->gt.hangcheck.work); > i915_reset_error_state(dev_priv); > - > i915_gem_driver_remove(dev_priv); > > intel_power_domains_driver_remove(dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_drv.h > b/drivers/gpu/drm/i915/i915_drv.h > index 59d4a1146039..0e715b1266f0 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2375,7 +2375,6 @@ extern const struct dev_pm_ops i915_pm_ops; > int i915_driver_probe(struct pci_dev *pdev, const struct pci_device_id *ent); > void i915_driver_remove(struct drm_device *dev); > > -void intel_engine_init_hangcheck(struct intel_engine_cs *engine); > int vlv_force_gfx_clock(struct drm_i915_private *dev_priv, bool on); > > static inline bool intel_gvt_active(struct drm_i915_private *dev_priv) > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c > b/drivers/gpu/drm/i915/i915_gpu_error.c > index 674d341a23f6..eff10ca5a788 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -554,10 +554,6 @@ static void error_print_engine(struct > drm_i915_error_state_buf *m, > } > err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); > err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); > - err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n", > - jiffies_to_msecs(ee->hangcheck_timestamp - epoch), > - ee->hangcheck_timestamp, > - ee->hangcheck_timestamp == epoch ? "; epoch" : ""); > err_printf(m, " engine reset count: %u\n", ee->reset_count); > > for (n = 0; n < ee->num_ports; n++) { > @@ -695,11 +691,8 @@ static void __err_print_to_sgl(struct > drm_i915_error_state_buf *m, > ts = ktime_to_timespec64(error->uptime); > err_printf(m, "Uptime: %lld s %ld us\n", > (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); > - err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); > - err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", > - error->capture, > - jiffies_to_msecs(jiffies - error->capture), > - jiffies_to_msecs(error->capture - error->epoch)); > + err_printf(m, "Capture: %lu jiffies; %d ms ago\n", > + error->capture, jiffies_to_msecs(jiffies - error->capture)); > > for (i = 0; i < ARRAY_SIZE(error->engine); i++) { > if (!error->engine[i].context.pid) > @@ -760,7 +753,9 @@ static void __err_print_to_sgl(struct > drm_i915_error_state_buf *m, > > for (i = 0; i < ARRAY_SIZE(error->engine); i++) { > if (error->engine[i].engine_id != -1) > - error_print_engine(m, &error->engine[i], error- > >epoch); > + error_print_engine(m, > + &error->engine[i], > + error->capture); > } > > for (i = 0; i < ARRAY_SIZE(error->engine); i++) { > @@ -790,7 +785,7 @@ static void __err_print_to_sgl(struct > drm_i915_error_state_buf *m, > for (j = 0; j < ee->num_requests; j++) > error_print_request(m, " ", > &ee->requests[j], > - error->epoch); > + error->capture); > } > > print_error_obj(m, m->i915->engine[i], > @@ -1171,8 +1166,6 @@ static void error_record_engine_registers(struct > i915_gpu_state *error, > } > > ee->idle = intel_engine_is_idle(engine); > - if (!ee->idle) > - ee->hangcheck_timestamp = engine- > >hangcheck.action_timestamp; > ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, > engine); > > @@ -1673,22 +1666,6 @@ static void capture_params(struct i915_gpu_state > *error) > i915_params_copy(&error->params, &i915_modparams); > } > > -static unsigned long capture_find_epoch(const struct i915_gpu_state *error) > -{ > - unsigned long epoch = error->capture; > - int i; > - > - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { > - const struct drm_i915_error_engine *ee = &error->engine[i]; > - > - if (ee->hangcheck_timestamp && > - time_before(ee->hangcheck_timestamp, epoch)) > - epoch = ee->hangcheck_timestamp; > - } > - > - return epoch; > -} > - > static void capture_finish(struct i915_gpu_state *error) > { > struct i915_ggtt *ggtt = &error->i915->ggtt; > @@ -1740,8 +1717,6 @@ i915_capture_gpu_state(struct drm_i915_private > *i915) > error->overlay = intel_overlay_capture_error_state(i915); > error->display = intel_display_capture_error_state(i915); > > - error->epoch = capture_find_epoch(error); > - > capture_finish(error); > compress_fini(&compress); > > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h > b/drivers/gpu/drm/i915/i915_gpu_error.h > index a24c35107d16..c4ba297ca862 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.h > +++ b/drivers/gpu/drm/i915/i915_gpu_error.h > @@ -34,7 +34,6 @@ struct i915_gpu_state { > ktime_t boottime; > ktime_t uptime; > unsigned long capture; > - unsigned long epoch; > > struct drm_i915_private *i915; > > @@ -84,7 +83,6 @@ struct i915_gpu_state { > int engine_id; > /* Software tracked state */ > bool idle; > - unsigned long hangcheck_timestamp; > int num_requests; > u32 reset_count; > > diff --git a/drivers/gpu/drm/i915/i915_params.c > b/drivers/gpu/drm/i915/i915_params.c > index 296452f9efe4..e4cd58e90e40 100644 > --- a/drivers/gpu/drm/i915/i915_params.c > +++ b/drivers/gpu/drm/i915/i915_params.c > @@ -77,11 +77,6 @@ i915_param_named(error_capture, bool, 0600, > "triaging and debugging hangs."); > #endif > > -i915_param_named_unsafe(enable_hangcheck, bool, 0600, > - "Periodically check GPU activity for detecting hangs. " > - "WARNING: Disabling this can cause system wide hangs. " > - "(default: true)"); > - > i915_param_named_unsafe(enable_psr, int, 0600, > "Enable PSR " > "(0=disabled, 1=enabled) " > diff --git a/drivers/gpu/drm/i915/i915_params.h > b/drivers/gpu/drm/i915/i915_params.h > index d29ade3b7de6..9a3359c6f6f3 100644 > --- a/drivers/gpu/drm/i915/i915_params.h > +++ b/drivers/gpu/drm/i915/i915_params.h > @@ -68,7 +68,6 @@ struct drm_printer; > param(char *, force_probe, CONFIG_DRM_I915_FORCE_PROBE) \ > /* leave bools at the end to not create holes */ \ > param(bool, alpha_support, > IS_ENABLED(CONFIG_DRM_I915_ALPHA_SUPPORT)) \ > - param(bool, enable_hangcheck, true) \ > param(bool, prefault_disable, false) \ > param(bool, load_detect_test, false) \ > param(bool, force_reset_modeset_test, false) \ > -- > 2.22.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx