On Fri, Mar 24, 2017 at 06:30:07PM -0700, Michel Thierry wrote: > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 87e76ef589b1..d484cbc561eb 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -1369,6 +1369,10 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift) > > if (tasklet) > tasklet_hi_schedule(&engine->irq_tasklet); > + > + if (iir & (GT_GEN8_WATCHDOG_INTERRUPT << test_shift)) { > + tasklet_hi_schedule(&engine->watchdog_tasklet); We don't need to set this as high, we definitely do want to process the live engines first and so some small latency in detecting the reset is no deal breaker. We probably don't even want to use a tasklet? (Or actually we do!) > static void hangcheck_declare_hang(struct drm_i915_private *i915, > unsigned int hung, > - unsigned int stuck) > + unsigned int stuck, > + unsigned int watchdog) That's a very interesting question as to whether we want to use the very heavy hangcheck and capture machine at all for the watchdog. > +#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function > +static void gen8_watchdog_irq_handler(unsigned long data) > +{ > + struct intel_engine_cs *engine = (struct intel_engine_cs *)data; > + struct drm_i915_private *dev_priv = engine->i915; > + u32 watchdog_disable, current_seqno; > + > + intel_uncore_forcewake_get(dev_priv, engine->fw_domains); > + > + if (engine->id == RCS) > + watchdog_disable = GEN8_RCS_WATCHDOG_DISABLE; > + else > + watchdog_disable = GEN8_XCS_WATCHDOG_DISABLE; > + > + /* Stop the counter to prevent further timeout interrupts */ > + I915_WRITE_FW(RING_CNTR(engine->mmio_base), watchdog_disable); > + > + /* false-positive, request completed after the timer expired */ False optimism in spotting the false positive. engine_is_idle() means all requests not the interesting one. Since you are using seqno, just reject when seqno == intel_engine_last_submit(). > + if (intel_engine_is_idle(engine)) > + goto fw_put; > + > + current_seqno = intel_engine_get_seqno(engine); > + if (engine->hangcheck.last_watchdog_seqno == current_seqno) { Or you could just reset the engine directly, once we rid it of that pesky mutex (which is done in all but name already). Doing that from inside the tasklet has some advantages -- we don't need to disable the execlists/guc tasklet. > + /* Make sure the active request will be marked as guilty */ > + engine->hangcheck.stalled = true; > + engine->hangcheck.seqno = intel_engine_get_seqno(engine); > + > + /* And try to run the hangcheck_work as soon as possible */ > + set_bit(I915_RESET_WATCHDOG, &dev_priv->gpu_error.flags); > + queue_delayed_work(system_long_wq, > + &dev_priv->gpu_error.hangcheck_work, 0); > + } else { > + engine->hangcheck.last_watchdog_seqno = current_seqno; > + /* Re-start the counter, if really hung, it will expire again */ > + I915_WRITE_FW(RING_THRESH(engine->mmio_base), GEN8_WATCHDOG_1000US); > + I915_WRITE_FW(RING_CNTR(engine->mmio_base), GEN8_WATCHDOG_ENABLE); > + } > + > +fw_put: > + intel_uncore_forcewake_put(dev_priv, engine->fw_domains); > +} > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index e8faf2c34c97..fffe69f5aed2 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -128,6 +128,7 @@ struct intel_instdone { > struct intel_engine_hangcheck { > u64 acthd; > u32 seqno; > + u32 last_watchdog_seqno; Just watchdog will be enough for its meaning to be clear. -Chris -- Chris Wilson, Intel Open Source Technology Centre _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx