Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Usually we have no idea about the upper bound we need to wait to catch > up with userspace when idling the device, but in a few situations we > know the system was idle beforehand and can provide a short timeout in > order to very quickly catch a failure, long before hangcheck kicks in. > > In the following patches, we will use the timeout to curtain two overly > long waits, where we know we can expect the GPU to complete within a > reasonable time or declare it broken. > > In particular, with a broken GPU we expect it to fail during the initial > GPU setup where do a couple of context switches to record the defaults. > This is a task that takes a few milliseconds even on the slowest of > devices, but we may have to wait 60s for hangcheck to give in and > declare the machine inoperable. In this a case where any gpu hang is > unacceptable, both from a timeliness and practical standpoint. > > The other improvement is that in selftests, we do not need to arm an > independent timer to inject a wedge, as we can just limit the timeout on > the wait directly. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 6 ++- > drivers/gpu/drm/i915/i915_drv.h | 2 +- > drivers/gpu/drm/i915/i915_gem.c | 43 +++++++++++-------- > drivers/gpu/drm/i915/i915_gem_evict.c | 3 +- > drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +- > drivers/gpu/drm/i915/i915_gem_shrinker.c | 11 +++-- > drivers/gpu/drm/i915/i915_perf.c | 4 +- > drivers/gpu/drm/i915/i915_request.c | 6 ++- > .../gpu/drm/i915/selftests/i915_gem_context.c | 4 +- > drivers/gpu/drm/i915/selftests/i915_request.c | 4 +- > .../gpu/drm/i915/selftests/igt_flush_test.c | 2 +- > 11 files changed, 56 insertions(+), 31 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 544e5e7f011f..099f97ef2303 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915, > > err = i915_gem_wait_for_idle(i915, > I915_WAIT_LOCKED | > - I915_WAIT_INTERRUPTIBLE); > + I915_WAIT_INTERRUPTIBLE, > + MAX_SCHEDULE_TIMEOUT); > if (err) > goto err_unlock; > > @@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val) > if (val & DROP_ACTIVE) > ret = i915_gem_wait_for_idle(dev_priv, > I915_WAIT_INTERRUPTIBLE | > - I915_WAIT_LOCKED); > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT); > > if (val & DROP_RETIRE) > i915_retire_requests(dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 09ab12458244..bec0a2796c37 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3154,7 +3154,7 @@ void i915_gem_init_swizzling(struct drm_i915_private *dev_priv); > void i915_gem_fini(struct drm_i915_private *dev_priv); > void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv); > int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv, > - unsigned int flags); > + unsigned int flags, long timeout); > int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv); > void i915_gem_suspend_late(struct drm_i915_private *dev_priv); > void i915_gem_resume(struct drm_i915_private *dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 1a9dab302433..093d98ed7755 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj) > > /* Attempt to reap some mmap space from dead objects */ > do { > - err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE); > + err = i915_gem_wait_for_idle(dev_priv, > + I915_WAIT_INTERRUPTIBLE, > + MAX_SCHEDULE_TIMEOUT); > if (err) > break; > > @@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) > return ret; > } > > -static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags) > +static long wait_for_timeline(struct i915_timeline *tl, > + unsigned int flags, long timeout) > { > struct i915_request *rq; > - long ret; > > rq = i915_gem_active_get_unlocked(&tl->last_request); > if (!rq) > - return 0; > + return timeout; > > /* > * "Race-to-idle". > @@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags) > if (flags & I915_WAIT_FOR_IDLE_BOOST) > gen6_rps_boost(rq, NULL); > > - ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT); > + timeout = i915_request_wait(rq, flags, timeout); > i915_request_put(rq); > > - return ret < 0 ? ret : 0; > + return timeout; > } > > static int wait_for_engines(struct drm_i915_private *i915) > @@ -3782,7 +3784,8 @@ static int wait_for_engines(struct drm_i915_private *i915) > return 0; > } > > -int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags) > +int i915_gem_wait_for_idle(struct drm_i915_private *i915, > + unsigned int flags, long timeout) > { > GEM_TRACE("flags=%x (%s)\n", > flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked"); Did you omit enhancing the trace on purpose? Eventually i915_request_wait will assert for silly timeout values, but should we add assertion here too as wait_for_timeline will return what we put, for nonactive timelines? > @@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags) > lockdep_assert_held(&i915->drm.struct_mutex); > > list_for_each_entry(tl, &i915->gt.timelines, link) { > - err = wait_for_timeline(tl, flags); > - if (err) > - return err; > + timeout = wait_for_timeline(tl, flags, timeout); > + if (timeout < 0) > + return timeout; > } > > err = wait_for_engines(i915); To truely serve the caller, the remaining timeout could be passed to wait_for_engines too, but that can be followup. So the only real thing is the missing trace. -Mika > @@ -3812,12 +3815,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags) > } else { > struct intel_engine_cs *engine; > enum intel_engine_id id; > - int err; > > for_each_engine(engine, i915, id) { > - err = wait_for_timeline(&engine->timeline, flags); > - if (err) > - return err; > + struct i915_timeline *tl = &engine->timeline; > + > + timeout = wait_for_timeline(tl, flags, timeout); > + if (timeout < 0) > + return timeout; > } > } > > @@ -5052,7 +5056,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv) > ret = i915_gem_wait_for_idle(dev_priv, > I915_WAIT_INTERRUPTIBLE | > I915_WAIT_LOCKED | > - I915_WAIT_FOR_IDLE_BOOST); > + I915_WAIT_FOR_IDLE_BOOST, > + MAX_SCHEDULE_TIMEOUT); > if (ret && ret != -EIO) > goto err_unlock; > > @@ -5356,7 +5361,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915) > if (err) > goto err_active; > > - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); > + err = i915_gem_wait_for_idle(i915, > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT); > if (err) > goto err_active; > > @@ -5421,7 +5428,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915) > if (WARN_ON(i915_gem_switch_to_kernel_context(i915))) > goto out_ctx; > > - if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED))) > + if (WARN_ON(i915_gem_wait_for_idle(i915, > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT))) > goto out_ctx; > > i915_gem_contexts_lost(i915); > diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c > index 54814a196ee4..02b83a5ed96c 100644 > --- a/drivers/gpu/drm/i915/i915_gem_evict.c > +++ b/drivers/gpu/drm/i915/i915_gem_evict.c > @@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915) > > err = i915_gem_wait_for_idle(i915, > I915_WAIT_INTERRUPTIBLE | > - I915_WAIT_LOCKED); > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT); > if (err) > return err; > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c > index 2ad319e17e39..abd81fb9b0b6 100644 > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c > @@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object *obj, > struct i915_ggtt *ggtt = &dev_priv->ggtt; > > if (unlikely(ggtt->do_idle_maps)) { > - if (i915_gem_wait_for_idle(dev_priv, 0)) { > + if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) { > DRM_ERROR("Failed to wait for idle; VT'd may hang.\n"); > /* Wait a bit, in hopes it avoids the hang */ > udelay(10); > diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c > index 55e84e71f526..c61f5b80fee3 100644 > --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c > +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c > @@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915, > * we will free as much as we can and hope to get a second chance. > */ > if (flags & I915_SHRINK_ACTIVE) > - i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); > + i915_gem_wait_for_idle(i915, > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT); > > trace_i915_gem_shrink(i915, target, flags); > i915_retire_requests(i915); > @@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private *i915, bool *unlock, > unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms); > > do { > - if (i915_gem_wait_for_idle(i915, 0) == 0 && > + if (i915_gem_wait_for_idle(i915, > + 0, MAX_SCHEDULE_TIMEOUT) == 0 && > shrinker_lock(i915, unlock)) > break; > > @@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr > return NOTIFY_DONE; > > /* Force everything onto the inactive lists */ > - ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); > + ret = i915_gem_wait_for_idle(i915, > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT); > if (ret) > goto out; > > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > index 447407fee3b8..6bf10952c724 100644 > --- a/drivers/gpu/drm/i915/i915_perf.c > +++ b/drivers/gpu/drm/i915/i915_perf.c > @@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv, > * So far the best way to work around this issue seems to be draining > * the GPU from any submitted work. > */ > - ret = i915_gem_wait_for_idle(dev_priv, wait_flags); > + ret = i915_gem_wait_for_idle(dev_priv, > + wait_flags, > + MAX_SCHEDULE_TIMEOUT); > if (ret) > goto out; > > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > index 3248369dbcfb..5c2c93cbab12 100644 > --- a/drivers/gpu/drm/i915/i915_request.c > +++ b/drivers/gpu/drm/i915/i915_request.c > @@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno) > /* Carefully retire all requests without writing to the rings */ > ret = i915_gem_wait_for_idle(i915, > I915_WAIT_INTERRUPTIBLE | > - I915_WAIT_LOCKED); > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT); > if (ret) > return ret; > > @@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) > /* Ratelimit ourselves to prevent oom from malicious clients */ > ret = i915_gem_wait_for_idle(i915, > I915_WAIT_LOCKED | > - I915_WAIT_INTERRUPTIBLE); > + I915_WAIT_INTERRUPTIBLE, > + MAX_SCHEDULE_TIMEOUT); > if (ret) > goto err_unreserve; > > diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c > index 5fbe15f4effd..ab2590242033 100644 > --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c > @@ -478,7 +478,9 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915, > } > } > > - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); > + err = i915_gem_wait_for_idle(i915, > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT); > if (err) > return err; > > diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c > index 43995fc3534d..c4aac6141e04 100644 > --- a/drivers/gpu/drm/i915/selftests/i915_request.c > +++ b/drivers/gpu/drm/i915/selftests/i915_request.c > @@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t, > t->func = func; > t->name = name; > > - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); > + err = i915_gem_wait_for_idle(i915, > + I915_WAIT_LOCKED, > + MAX_SCHEDULE_TIMEOUT); > if (err) { > pr_err("%s(%s): failed to idle before, with err=%d!", > func, name, err); > diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c > index 0d06f559243f..09ab037ce803 100644 > --- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c > +++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c > @@ -64,7 +64,7 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags) > } > > wedge_on_timeout(&w, i915, HZ) > - i915_gem_wait_for_idle(i915, flags); > + i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT); > > return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0; > } > -- > 2.18.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx