On Fri, 27 Apr 2012 15:17:38 +0200 Daniel Vetter <daniel.vetter at ffwll.ch> wrote: > gpu reset is a very important piece of our infrastructure. > Unfortunately we only really it test by actually hanging the gpu, > which often has bad side-effects for the entire system. And the gpu > hang handling code is one of the rather complicated pieces of code we > have, consisting of > - hang detection > - error capture > - actual gpu reset > - reset of all the gem bookkeeping > - reinitialition of the entire gpu > > This patch adds a debugfs to selectively stopping rings by ceasing to > update the hw tail pointer, which will result in the gpu no longer > updating it's head pointer and eventually to the hangcheck firing. > This way we can exercise the gpu hang code under controlled conditions > without a dying gpu taking down the entire systems. You mean, tail pointer? > > Patch motivated by me forgetting to properly reinitialize ppgtt after > a gpu reset. > > Usage: > > echo $((1 << $ringnum)) > i915_ring_stop # stops one ring > > echo 0xffffffff > i915_ring_stop # stops all, future-proof version > > then run whatever testload is desired. i915_ring_stop automatically > resets after a gpu hang is detected to avoid hanging the gpu to fast > and declaring it wedged. > > v2: Incorporate feedback from Chris Wilson. > > v3: Add the missing cleanup. > > Signed-Off-by: Daniel Vetter <daniel.vetter at ffwll.ch> > Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 65 > +++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/i915_drv.h | 2 + > drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++ 3 files changed, 71 > insertions(+), 0 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c > b/drivers/gpu/drm/i915/i915_debugfs.c index 120db46..3f6e28e 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -1585,6 +1585,64 @@ static const struct file_operations > i915_wedged_fops = { }; > > static ssize_t > +i915_ring_stop_read(struct file *filp, > + char __user *ubuf, > + size_t max, > + loff_t *ppos) > +{ > + struct drm_device *dev = filp->private_data; > + drm_i915_private_t *dev_priv = dev->dev_private; > + char buf[80]; > + int len; > + > + len = snprintf(buf, sizeof(buf), > + "0x%08x\n", dev_priv->stop_rings); > + > + if (len > sizeof(buf)) > + len = sizeof(buf); > + > + return simple_read_from_buffer(ubuf, max, ppos, buf, len); > +} > + > +static ssize_t > +i915_ring_stop_write(struct file *filp, > + const char __user *ubuf, > + size_t cnt, > + loff_t *ppos) > +{ > + struct drm_device *dev = filp->private_data; > + struct drm_i915_private *dev_priv = dev->dev_private; > + char buf[20]; > + int val = 0; > + > + if (cnt > 0) { > + if (cnt > sizeof(buf) - 1) > + return -EINVAL; > + > + if (copy_from_user(buf, ubuf, cnt)) > + return -EFAULT; > + buf[cnt] = 0; > + > + val = simple_strtoul(buf, NULL, 0); > + } > + > + DRM_DEBUG_DRIVER("Stopping rings 0x%08x\n", val); > + > + mutex_lock(&dev->struct_mutex); > + dev_priv->stop_rings = val; > + mutex_unlock(&dev->struct_mutex); > + > + return cnt; > +} Not a huge deal, but I'd go with an interruptible mutex lock there. Also, I think it would be cool if you did a tail update when going from !0->0 stop_rings. > + > +static const struct file_operations i915_ring_stop_fops = { > + .owner = THIS_MODULE, > + .open = simple_open, > + .read = i915_ring_stop_read, > + .write = i915_ring_stop_write, > + .llseek = default_llseek, > +}; > +static ssize_t > i915_max_freq_read(struct file *filp, > char __user *ubuf, > size_t max, > @@ -1884,6 +1942,11 @@ int i915_debugfs_init(struct drm_minor *minor) > &i915_cache_sharing_fops); > if (ret) > return ret; > + ret = i915_debugfs_create(minor->debugfs_root, minor, > + "i915_ring_stop", > + &i915_ring_stop_fops); > + if (ret) > + return ret; > > return drm_debugfs_create_files(i915_debugfs_list, > I915_DEBUGFS_ENTRIES, > @@ -1902,6 +1965,8 @@ void i915_debugfs_cleanup(struct drm_minor > *minor) 1, minor); > drm_debugfs_remove_files((struct drm_info_list *) > &i915_cache_sharing_fops, 1, minor); > + drm_debugfs_remove_files((struct drm_info_list *) > &i915_ring_stop_fops, > + 1, minor); > } > > #endif /* CONFIG_DEBUG_FS */ > diff --git a/drivers/gpu/drm/i915/i915_drv.h > b/drivers/gpu/drm/i915/i915_drv.h index 0095c8d..5e62b35 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -413,6 +413,8 @@ typedef struct drm_i915_private { > uint32_t last_instdone; > uint32_t last_instdone1; > > + unsigned int stop_rings; > + > unsigned long cfb_size; > unsigned int cfb_fb; > enum plane cfb_plane; > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c > b/drivers/gpu/drm/i915/intel_ringbuffer.c index 427b7c5..d2de16d > 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -1221,7 +1221,11 @@ int intel_ring_begin(struct intel_ring_buffer > *ring, > void intel_ring_advance(struct intel_ring_buffer *ring) > { > + struct drm_i915_private *dev_priv = ring->dev->dev_private; > + > ring->tail &= ring->size - 1; > + if (dev_priv->stop_rings & intel_ring_flag(ring)) > + return; > ring->write_tail(ring, ring->tail); > } >