On Thu, Jun 06, 2013 at 10:45:42AM +0100, Chris Wilson wrote: > After kicking a ring, it should be free to make progress again and so > should not be accused of being stuck until hangcheck fires once more. In > order to catch a denial-of-service within a batch or across multiple > batches, we still do increment the hangcheck score - just not as > severely so that it takes multiple kicks to fail. > > This should address part of Ben's justified criticism of > > commit 05407ff889ceebe383aa5907219f86582ef96b72 > Author: Mika Kuoppala <mika.kuoppala at linux.intel.com> > Date: Thu May 30 09:04:29 2013 +0300 > > drm/i915: detect hang using per ring hangcheck_score > > "There's also another corner case on the kick. If the seqno = 2 > (though not stuck), and on the 3rd hangcheck, the ring is stuck, and > we try to kick it... we don't actually try to find out if the kick > helped." > > v2: Make sure we catch DoS attempts with batches full of invalid WAITs. > > References: https://bugs.freedesktop.org/show_bug.cgi?id=65394 > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk> > Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com> > Cc: Ben Widawsky <ben at bwidawsk.net> > --- > drivers/gpu/drm/i915/i915_irq.c | 105 ++++++++++++++++++++------------------- > 1 file changed, 53 insertions(+), 52 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 85694d7..2d1890d 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2324,21 +2324,11 @@ ring_last_seqno(struct intel_ring_buffer *ring) > struct drm_i915_gem_request, list)->seqno; > } > > -static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, > - u32 ring_seqno, bool *err) > -{ > - if (list_empty(&ring->request_list) || > - i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) { > - /* Issue a wake-up to catch stuck h/w. */ > - if (waitqueue_active(&ring->irq_queue)) { > - DRM_ERROR("Hangcheck timer elapsed... %s idle\n", > - ring->name); > - wake_up_all(&ring->irq_queue); > - *err = true; > - } > - return true; > - } > - return false; > +static bool > +ring_idle(struct intel_ring_buffer *ring, u32 seqno) > +{ > + return (list_empty(&ring->request_list) || > + i915_seqno_passed(seqno, ring_last_seqno(ring))); > } > > static bool semaphore_passed(struct intel_ring_buffer *ring) > @@ -2372,16 +2362,26 @@ static bool semaphore_passed(struct intel_ring_buffer *ring) > ioread32(ring->virtual_start+acthd+4)+1); > } > > -static bool kick_ring(struct intel_ring_buffer *ring) > +static bool ring_hung(struct intel_ring_buffer *ring) > { > struct drm_device *dev = ring->dev; > struct drm_i915_private *dev_priv = dev->dev_private; > - u32 tmp = I915_READ_CTL(ring); > + u32 tmp; > + > + if (IS_GEN2(dev)) > + return true; > + > + /* Is the chip hanging on a WAIT_FOR_EVENT? > + * If so we can simply poke the RB_WAIT bit > + * and break the hang. This should work on > + * all but the second generation chipsets. > + */ > + tmp = I915_READ_CTL(ring); > if (tmp & RING_WAIT) { > DRM_ERROR("Kicking stuck wait on %s\n", > ring->name); > I915_WRITE_CTL(ring, tmp); > - return true; > + return false; > } > > if (INTEL_INFO(dev)->gen >= 6 && > @@ -2390,22 +2390,10 @@ static bool kick_ring(struct intel_ring_buffer *ring) > DRM_ERROR("Kicking stuck semaphore on %s\n", > ring->name); > I915_WRITE_CTL(ring, tmp); > - return true; > - } > - return false; > -} > - > -static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring) > -{ > - if (IS_GEN2(ring->dev)) > return false; > + } > > - /* Is the chip hanging on a WAIT_FOR_EVENT? > - * If so we can simply poke the RB_WAIT bit > - * and break the hang. This should work on > - * all but the second generation chipsets. > - */ > - return !kick_ring(ring); > + return true; > } > > /** > @@ -2423,37 +2411,50 @@ void i915_hangcheck_elapsed(unsigned long data) > struct intel_ring_buffer *ring; > int i; > int busy_count = 0, rings_hung = 0; > - bool stuck[I915_NUM_RINGS]; > + bool stuck[I915_NUM_RINGS] = { 0 }; > +#define KICK 5 > +#define HUNG 20 > +#define FIRE 30 > > if (!i915_enable_hangcheck) > return; > > for_each_ring(ring, dev_priv, i) { > u32 seqno, acthd; > - bool idle, err = false; > > seqno = ring->get_seqno(ring, false); > acthd = intel_ring_get_active_head(ring); > - idle = i915_hangcheck_ring_idle(ring, seqno, &err); > - stuck[i] = ring->hangcheck.acthd == acthd; > - > - if (idle) { > - if (err) > - ring->hangcheck.score += 2; > - else > - ring->hangcheck.score = 0; > - } else { > - busy_count++; > > - if (ring->hangcheck.seqno == seqno) { > - ring->hangcheck.score++; > - > - /* Kick ring if stuck*/ > - if (stuck[i]) > - i915_hangcheck_ring_hung(ring); > + if (ring->hangcheck.seqno == seqno) { > + if (ring_idle(ring, seqno)) { > + if (waitqueue_active(&ring->irq_queue)) { > + /* Issue a wake-up to catch stuck h/w. */ > + DRM_ERROR("Hangcheck timer elapsed... %s idle\n", > + ring->name); > + wake_up_all(&ring->irq_queue); > + ring->hangcheck.score += HUNG; > + busy_count++; > + } Correct me if I'm wrong, but we shouldn't be able to get here if the waitqueue isn't active, so perhaps a WARN for the else? > } else { > - ring->hangcheck.score = 0; > + stuck[i] = ring->hangcheck.acthd == acthd; > + if (stuck[i]) { > + /* Every time we kick the ring, add a > + * small increment to the hangcheck > + * score so that we can catch a > + * batch that is repeatedly kicked. > + */ > + ring->hangcheck.score += KICK; > + if (ring_hung(ring)) > + ring->hangcheck.score += HUNG; > + } > + busy_count++; > } > + } else { > + /* Gradually reduce the count so that we catch DoS > + * attempts across multiple batches. > + */ > + if (ring->hangcheck.score > 0) > + ring->hangcheck.score--; > } This else feels weird to me. I think we'd want to decrease at least by a multiple of KICK, or HUNG if seqno is actually moving. I'm also not certain I understand why we're worried about DoS. Did that conversation happen somewhere I can't find? > > ring->hangcheck.seqno = seqno; > @@ -2461,7 +2462,7 @@ void i915_hangcheck_elapsed(unsigned long data) > } > > for_each_ring(ring, dev_priv, i) { > - if (ring->hangcheck.score > 2) { > + if (ring->hangcheck.score > FIRE) { > rings_hung++; > DRM_ERROR("%s: %s on %s 0x%x\n", ring->name, > stuck[i] ? "stuck" : "no progress", > -- > 1.7.10.4 > -- Ben Widawsky, Intel Open Source Technology Center