On Mon, Aug 26, 2013 at 02:51:53PM +0100, Chris Wilson wrote: > When we switched to always using a timeout in conjunction with > wait_seqno, we lost the ability to detect missed interrupts. Since, we > have had issues with interrupts on a number of generations, and they are > required to be delivered in a timely fashion for a smooth UX, it is > important that we do log errors found in the wild and prevent the > display stalling for upwards of 1s every time the seqno interrupt is > missed. > > Rather than continue to fix up the timeouts to work around the interface > impedence in wait_event_*(), open code the combination of > wait_event[_interruptible][_timeout], and use the exposed timer to > poll for seqno should we detect a lost interrupt. > > v2: In order to satisfy the debug requirement of logging missed > interrupts with the real world requirments of making machines work even > if interrupts are hosed, we revert to polling after detecting a missed > interrupt. > > v3: Throw in a debugfs interface to simulate broken hw not reporting > interrupts. This interface should be a separate patch. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 68 ++++++++++++++++++++ > drivers/gpu/drm/i915/i915_drv.h | 5 ++ > drivers/gpu/drm/i915/i915_gem.c | 114 ++++++++++++++++++++-------------- > drivers/gpu/drm/i915/i915_gpu_error.c | 1 + > drivers/gpu/drm/i915/i915_irq.c | 11 ++-- > 5 files changed, 148 insertions(+), 51 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 55ab924..e823169 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -1876,6 +1876,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops, > i915_ring_stop_get, i915_ring_stop_set, > "0x%08llx\n"); > > +static int > +i915_ring_missed_irq_get(void *data, u64 *val) > +{ > + struct drm_device *dev = data; > + drm_i915_private_t *dev_priv = dev->dev_private; > + > + *val = dev_priv->gpu_error.missed_irq_rings; > + return 0; > +} > + > +static int > +i915_ring_missed_irq_set(void *data, u64 val) > +{ > + struct drm_device *dev = data; > + struct drm_i915_private *dev_priv = dev->dev_private; > + int ret; > + > + /* Lock against concurrent debugfs callers */ > + ret = mutex_lock_interruptible(&dev->struct_mutex); > + if (ret) > + return ret; > + dev_priv->gpu_error.missed_irq_rings = val; > + mutex_unlock(&dev->struct_mutex); > + > + return 0; > +} > + > +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops, > + i915_ring_missed_irq_get, i915_ring_missed_irq_set, > + "0x%08llx\n"); > + > +static int > +i915_ring_test_irq_get(void *data, u64 *val) > +{ > + struct drm_device *dev = data; > + drm_i915_private_t *dev_priv = dev->dev_private; > + > + *val = dev_priv->gpu_error.test_irq_rings; > + > + return 0; > +} > + > +static int > +i915_ring_test_irq_set(void *data, u64 val) > +{ > + struct drm_device *dev = data; > + struct drm_i915_private *dev_priv = dev->dev_private; > + int ret; > + > + DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val); > + > + /* Lock against concurrent debugfs callers */ > + ret = mutex_lock_interruptible(&dev->struct_mutex); > + if (ret) > + return ret; > + > + dev_priv->gpu_error.test_irq_rings = val; > + mutex_unlock(&dev->struct_mutex); > + > + return 0; > +} > + > +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops, > + i915_ring_test_irq_get, i915_ring_test_irq_set, > + "0x%08llx\n"); > + > #define DROP_UNBOUND 0x1 > #define DROP_BOUND 0x2 > #define DROP_RETIRE 0x4 > @@ -2269,6 +2335,8 @@ static struct i915_debugfs_files { > {"i915_min_freq", &i915_min_freq_fops}, > {"i915_cache_sharing", &i915_cache_sharing_fops}, > {"i915_ring_stop", &i915_ring_stop_fops}, > + {"i915_ring_missed_irq", &i915_ring_missed_irq_fops}, > + {"i915_ring_test_irq", &i915_ring_test_irq_fops}, > {"i915_gem_drop_caches", &i915_drop_caches_fops}, > {"i915_error_state", &i915_error_state_fops}, > {"i915_next_seqno", &i915_next_seqno_fops}, > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 84b95b1..ac6db6e 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -993,6 +993,8 @@ struct i915_gpu_error { > > unsigned long last_reset; > > + unsigned long missed_irq_rings; > + > /** > * State variable and reset counter controlling the reset flow > * > @@ -1031,6 +1033,9 @@ struct i915_gpu_error { > > /* For gpu hang simulation. */ > unsigned int stop_rings; > + > + /* For missed irq/seqno simulation. */ > + unsigned int test_irq_rings; > }; > > enum modeset_restore { > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index ccfebf5..4ed782f 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -970,6 +970,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno) > return ret; > } > > +static void fake_irq(unsigned long data) > +{ > + wake_up_process((struct task_struct *)data); > +} > + > +static bool missed_irq(struct drm_i915_private *dev_priv, > + struct intel_ring_buffer *ring) > +{ > + return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings); > +} > + > /** > * __wait_seqno - wait until execution of seqno has finished > * @ring: the ring expected to report seqno > @@ -993,10 +1004,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > bool interruptible, struct timespec *timeout) > { > drm_i915_private_t *dev_priv = ring->dev->dev_private; > - struct timespec before, now, wait_time={1,0}; > - unsigned long timeout_jiffies; > - long end; > - bool wait_forever = true; > + struct timespec before, now; > + DEFINE_WAIT(wait); > + long timeout_jiffies; > int ret; > > WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n"); > @@ -1004,51 +1014,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > if (i915_seqno_passed(ring->get_seqno(ring, true), seqno)) > return 0; > > - trace_i915_gem_request_wait_begin(ring, seqno); > - > - if (timeout != NULL) { > - wait_time = *timeout; > - wait_forever = false; > - } > - > - timeout_jiffies = timespec_to_jiffies_timeout(&wait_time); > + timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1; > > - if (WARN_ON(!ring->irq_get(ring))) > + if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) && > + WARN_ON(!ring->irq_get(ring))) > return -ENODEV; > > - /* Record current time in case interrupted by signal, or wedged * */ > + /* Record current time in case interrupted by signal, or wedged */ > + trace_i915_gem_request_wait_begin(ring, seqno); > getrawmonotonic(&before); > + for (;;) { > + struct timer_list timer; > + unsigned long expire; > > -#define EXIT_COND \ > - (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ > - i915_reset_in_progress(&dev_priv->gpu_error) || \ > - reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) > - do { > - if (interruptible) > - end = wait_event_interruptible_timeout(ring->irq_queue, > - EXIT_COND, > - timeout_jiffies); > - else > - end = wait_event_timeout(ring->irq_queue, EXIT_COND, > - timeout_jiffies); > + prepare_to_wait(&ring->irq_queue, &wait, > + interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); > > /* We need to check whether any gpu reset happened in between > * the caller grabbing the seqno and now ... */ > - if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) > - end = -EAGAIN; > + if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) { > + /* ... but upgrade the -EGAIN to an -EIO if the gpu ^ EAGAIN (I realize it's copypasta) > + * is truely gone. */ ^ truly > + ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); > + if (ret == 0) > + ret = -EAGAIN; > + break; > + } > > - /* ... but upgrade the -EGAIN to an -EIO if the gpu is truely > - * gone. */ > - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); > - if (ret) > - end = ret; > - } while (end == 0 && wait_forever); > + if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) { > + ret = 0; > + break; > + } > + > + if (interruptible && signal_pending(current)) { > + ret = -ERESTARTSYS; > + break; > + } > + Not sure, but I think the signal check should come first. As long as you've thought about it, I'm okay with whatever. > + if (timeout_jiffies <= 0) { > + ret = -ETIME; > + break; > + } > > + timer.function = NULL; > + if (timeout || missed_irq(dev_priv, ring)) { > + setup_timer_on_stack(&timer, fake_irq, (unsigned long)current); > + expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies); > + mod_timer(&timer, expire); > + } Isn't this just a fancy schedule_timeout_interruptible? > + > + schedule(); > + > + if (timeout) > + timeout_jiffies = expire - jiffies; > + > + if (timer.function) { > + del_singleshot_timer_sync(&timer); > + destroy_timer_on_stack(&timer); > + } > + } > getrawmonotonic(&now); > + trace_i915_gem_request_wait_end(ring, seqno); > > ring->irq_put(ring); > - trace_i915_gem_request_wait_end(ring, seqno); > -#undef EXIT_COND > + > + finish_wait(&ring->irq_queue, &wait); > > if (timeout) { > struct timespec sleep_time = timespec_sub(now, before); > @@ -1057,17 +1087,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > set_normalized_timespec(timeout, 0, 0); > } > > - switch (end) { > - case -EIO: > - case -EAGAIN: /* Wedged */ > - case -ERESTARTSYS: /* Signal */ > - return (int)end; > - case 0: /* Timeout */ > - return -ETIME; > - default: /* Completed */ > - WARN_ON(end < 0); /* We're not aware of other errors */ > - return 0; > - } > + return ret; > } > > /** > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 558e568..62368f4 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -288,6 +288,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); > err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); > err_printf(m, "CCID: 0x%08x\n", error->ccid); > + err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings); > > for (i = 0; i < dev_priv->num_fence_regs; i++) > err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index a03b445..7ebf105 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -1972,10 +1972,13 @@ static void i915_hangcheck_elapsed(unsigned long data) > if (ring_idle(ring, seqno)) { > if (waitqueue_active(&ring->irq_queue)) { > /* Issue a wake-up to catch stuck h/w. */ > - DRM_ERROR("Hangcheck timer elapsed... %s idle\n", > - ring->name); > - wake_up_all(&ring->irq_queue); > - ring->hangcheck.score += HUNG; > + if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) { > + DRM_ERROR("Hangcheck timer elapsed... %s idle\n", > + ring->name); > + wake_up_all(&ring->irq_queue); > + } > + /* Safeguard against driver failure */ > + ring->hangcheck.score += BUSY; > } else > busy = false; > } else { Honestly, I don't think using the scheduler APIs is a step in the right direction wrt readability - but I have no better solution, and it seems to do what you want. Anyway, I tried to review it, and failed. If you want to split it up more, I can try again. Meanwhile, the idea seems okay to me. -- Ben Widawsky, Intel Open Source Technology Center _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx