On Wed, Jan 13, 2016 at 05:28:16PM +0000, Arun Siluvery wrote: > From: Tomas Elf <tomas.elf@xxxxxxxxx> > > With the per-engine hang recovery path already in place this patch adds > per-engine hang detection by letting the periodic hang checker detect hangs on > individual engines and communicate this to the error handler. During hang > checking every engine is checked and the hang detection status for each engine > is aggregated into a single 32-bit engine flag mask that contains all the > engine flags (1 << ring->id) of all the hung engines or'ed together. The > per-engine path in the error handler then sets up the hangcheck state for each > invidual, hung engine based on the engine flag mask before potentially calling > the per-engine hang recovery path. > > This allows the hang detection to happen in lock-step for all engines in > parallel and lets the driver process all hung engines in turn in the error > handler. > > Signed-off-by: Tomas Elf <tomas.elf@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 2 +- > drivers/gpu/drm/i915/i915_drv.h | 4 ++-- > drivers/gpu/drm/i915/i915_irq.c | 41 +++++++++++++++++++++++-------------- > 3 files changed, 29 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index e3377ab..6d1b6c3 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -4720,7 +4720,7 @@ i915_wedged_set(void *data, u64 val) > > intel_runtime_pm_get(dev_priv); > > - i915_handle_error(dev, val, > + i915_handle_error(dev, 0x0, val, > "Manually setting wedged to %llu", val); > > intel_runtime_pm_put(dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index e866f14..85cf692 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2765,8 +2765,8 @@ static inline void i915_hangcheck_reinit(struct intel_engine_cs *engine) > > /* i915_irq.c */ > void i915_queue_hangcheck(struct drm_device *dev); > -__printf(3, 4) > -void i915_handle_error(struct drm_device *dev, bool wedged, > +__printf(4, 5) > +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged, > const char *fmt, ...); > > extern void intel_irq_init(struct drm_i915_private *dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 6a0ec37..fef74cf 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2712,15 +2712,29 @@ static void i915_report_and_clear_eir(struct drm_device *dev) > > /** > * i915_handle_error - handle a gpu error > - * @dev: drm device > + * @dev: drm device > + * @engine_mask: Bit mask containing the engine flags of all engines > + * associated with one or more detected errors. > + * May be 0x0. > + * If wedged is set to true this implies that one or more > + * engine hangs were detected. In this case we will > + * attempt to reset all engines that have been detected > + * as hung. > + * If a previous engine reset was attempted too recently > + * or if one of the current engine resets fails we fall > + * back to legacy full GPU reset. > + * @wedged: true = Hang detected, invoke hang recovery. These two look to be tautological. If wedged == 0, we expect engine_mask to be 0 zero as well since we will not be resetting the engines, just capturing error state. It's not though. Conversely if wedged == 1, we expect engine_mask to be set. Again, it's not and I think there the caller is wrong. > + * @fmt, ...: Error message describing reason for error. > * > * Do some basic checking of register state at error time and > * dump it to the syslog. Also call i915_capture_error_state() to make > * sure we get a record and make it available in debugfs. Fire a uevent > * so userspace knows something bad happened (should trigger collection > - * of a ring dump etc.). > + * of a ring dump etc.). If a hang was detected (wedged = true) try to > + * reset the associated engine. Failing that, try to fall back to legacy > + * full GPU reset recovery mode. > */ > -void i915_handle_error(struct drm_device *dev, bool wedged, > +void i915_handle_error(struct drm_device *dev, u32 engine_mask, bool wedged, > const char *fmt, ...) > { > struct drm_i915_private *dev_priv = dev->dev_private; > @@ -2729,12 +2743,6 @@ void i915_handle_error(struct drm_device *dev, bool wedged, > > struct intel_engine_cs *engine; > > - /* > - * NB: Placeholder until the hang checker supports > - * per-engine hang detection. > - */ > - u32 engine_mask = 0; > - > va_start(args, fmt); > vscnprintf(error_msg, sizeof(error_msg), fmt, args); > va_end(args); > @@ -3162,7 +3170,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd) > */ > tmp = I915_READ_CTL(ring); > if (tmp & RING_WAIT) { > - i915_handle_error(dev, false, > + i915_handle_error(dev, intel_ring_flag(ring), false, > "Kicking stuck wait on %s", > ring->name); > I915_WRITE_CTL(ring, tmp); > @@ -3174,7 +3182,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd) > default: > return HANGCHECK_HUNG; > case 1: > - i915_handle_error(dev, false, > + i915_handle_error(dev, intel_ring_flag(ring), false, > "Kicking stuck semaphore on %s", > ring->name); > I915_WRITE_CTL(ring, tmp); > @@ -3203,7 +3211,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work) > struct drm_device *dev = dev_priv->dev; > struct intel_engine_cs *ring; > int i; > - int busy_count = 0, rings_hung = 0; > + u32 engine_mask = 0; > + int busy_count = 0; > bool stuck[I915_NUM_RINGS] = { 0 }; > #define BUSY 1 > #define KICK 5 > @@ -3316,12 +3325,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work) > DRM_INFO("%s on %s\n", > stuck[i] ? "stuck" : "no progress", > ring->name); > - rings_hung++; > + > + engine_mask |= intel_ring_flag(ring); > + ring->hangcheck.tdr_count++; tdr_count was introduced unused in the previous patch and here it has nothing to do with tdr, but just hangcheck pure and simple. It would actually be a useful statistic for hangcheck/error-state... -Chris -- Chris Wilson, Intel Open Source Technology Centre _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx