On Mon, Dec 16, 2013 at 04:03:52PM +0000, Lister, Ian wrote: > From f71a7de85e9d81be3aa3962c8fe2557235ff21c1 Mon Sep 17 00:00:00 2001 > Message-Id: <f71a7de85e9d81be3aa3962c8fe2557235ff21c1.1387201899.git.ian.lister@xxxxxxxxx> > In-Reply-To: <cover.1387201899.git.ian.lister@xxxxxxxxx> > References: <cover.1387201899.git.ian.lister@xxxxxxxxx> > From: ian-lister <ian.lister@xxxxxxxxx> > Date: Wed, 11 Dec 2013 11:25:44 +0000 > Subject: [RFC 13/13] drm/i915: Exec buffer inserts watchdog commands > > A start timer command is inserted before the batch buffer start command > and a stop timer command is inserted afterwards. If the batch executes > normally then the timer will be cancelled before it can fire, however if > the batch buffer hangs then the watchdog timer will fire and > i915_handle_error is called to complete the recovery work. > > This patch requires the per-engine TDR patches. > > Signed-off-by: ian-lister <ian.lister@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 16 +++++++++++++--- > drivers/gpu/drm/i915/i915_gem_execbuffer.c | 22 ++++++++++++++++++++++ > drivers/gpu/drm/i915/i915_irq.c | 1 + > drivers/gpu/drm/i915/intel_ringbuffer.h | 1 + > 4 files changed, 37 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index f00b19c..6867b096 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -906,15 +906,23 @@ i915_ring_hangcheck_read(struct file *filp, > * have hung and been reset since boot */ > struct drm_device *dev = filp->private_data; > drm_i915_private_t *dev_priv = dev->dev_private; > - char buf[100]; > + char buf[200]; > int len; > > len = scnprintf(buf, sizeof(buf), > - "GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X\n", > + "GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X," > + "RCS_T=0x%08x,VCS_T=0x%08x,BCS_T=0x%08x," > + "RCS_W=0x%08x,VCS_W=0x%08x,BCS_W=0x%08x\n", > dev_priv->gpu_error.global_resets, > dev_priv->ring[RCS].hangcheck.total, > dev_priv->ring[VCS].hangcheck.total, > - dev_priv->ring[BCS].hangcheck.total); > + dev_priv->ring[BCS].hangcheck.total, > + dev_priv->ring[RCS].hangcheck.tdr_count, > + dev_priv->ring[VCS].hangcheck.tdr_count, > + dev_priv->ring[BCS].hangcheck.tdr_count, > + dev_priv->ring[RCS].hangcheck.watchdog_count, > + dev_priv->ring[VCS].hangcheck.watchdog_count, > + dev_priv->ring[BCS].hangcheck.watchdog_count); > > return simple_read_from_buffer(ubuf, max, ppos, buf, len); > } > @@ -935,6 +943,8 @@ i915_ring_hangcheck_write(struct file *filp, > for (i = 0; i < I915_NUM_RINGS; i++) { > /* Reset the hangcheck counters */ > dev_priv->ring[i].hangcheck.total = 0; > + dev_priv->ring[i].hangcheck.tdr_count = 0; > + dev_priv->ring[i].hangcheck.watchdog_count = 0; > } Looks like unrelated debug code, needs to be extracted and shuffled into the right patch (or separate debug patch). There's more like this below. > > dev_priv->gpu_error.global_resets = 0; > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > index c9d330a..722b41e 100644 > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c > @@ -32,6 +32,7 @@ > #include "i915_trace.h" > #include "intel_drv.h" > #include <linux/dma_remapping.h> > +#include "intel_ringbuffer.h" > > struct eb_vmas { > struct list_head vmas; > @@ -1196,6 +1197,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, > goto err; > } > > + /* Start watchdog timer*/ > + if ((args->flags & I915_EXEC_ENABLE_WATCHDOG) && > + i915_enable_watchdog && > + intel_ring_watchdog_supported(ring)) { Imo just yell at userspace (with an -EINVAL) if it tries to use the watchdog support on unsupported rings. Which means we need a driver flag for this. Also I don't see the value of of a module option for this (I presume i915_enable_watchdog is this) since this is purely opt-in from userspace. > + > + ret = intel_ring_start_watchdog(ring); > + > + if (ret) > + goto err; > + } > + > exec_start = i915_gem_obj_offset(batch_obj, vm) + > args->batch_start_offset; > exec_len = args->batch_len; > @@ -1220,6 +1232,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, > goto err; > } > > + /* Cancel watchdog timer */ > + if ((args->flags & I915_EXEC_ENABLE_WATCHDOG) && > + i915_enable_watchdog && > + intel_ring_watchdog_supported(ring)) { > + > + ret = intel_ring_stop_watchdog(ring); > + if (ret) > + goto err; > + } > + > trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags); > > i915_gem_execbuffer_move_to_active(&eb->vmas, ring); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index c26c3db..bc7d68b 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2974,6 +2974,7 @@ static void i915_hangcheck_elapsed(unsigned long data) > busy_count += i915_hangcheck_ring_sample(ring); > > if (ring->hangcheck.score > FIRE) { > + ring->hangcheck.tdr_count++; > DRM_ERROR("Hang detected on %s\n", > ring->name); > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index 473cb94..4a42638 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -72,6 +72,7 @@ struct intel_ring_hangcheck { > u32 total; /* Total resets applied to this ring/engine*/ > u32 last_head; /* Head value recorded at last hang */ > u32 status_updated; > + u32 tdr_count; /* Total TDR resets for this ring */ > u32 watchdog_threshold; > u32 watchdog_count; /* Total watchdog resets for this ring */ > }; > -- > 1.8.5.1 > > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > http://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Daniel Vetter Software Engineer, Intel Corporation +41 (0) 79 365 57 48 - http://blog.ffwll.ch _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx