Quoting Chris Wilson (2018-03-30 14:18:01) > We don't handle resetting the kernel context very well, or presumably any > context executing its breadcrumb commands in the ring as opposed to the > batchbuffer and flush. If we trigger a device reset twice in quick > succession while the kernel context is executing, we may end up skipping > the breadcrumb. This is really only a problem for the selftest as > normally there is a large interlude between resets (hangcheck), or we > focus on resetting just one engine and so avoid repeatedly resetting > innocents. > > Something to try would be a preempt-to-idle to quiesce the engine before > reset, so that innocent contexts would be spared the reset. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > Cc: Michał Winiarski <michal.winiarski@xxxxxxxxx> > CC: Michel Thierry <michel.thierry@xxxxxxxxx> Anyone? > --- > drivers/gpu/drm/i915/i915_drv.c | 3 ++ > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 48 ++++++++++++++++++++++-- > 2 files changed, 47 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index d354627882e3..684060ed8db6 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915) > int ret; > int i; > > + GEM_TRACE("flags=%lx\n", error->flags); > + > might_sleep(); > lockdep_assert_held(&i915->drm.struct_mutex); > GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); > @@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) > struct i915_request *active_request; > int ret; > > + GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); > GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); > > active_request = i915_gem_reset_prepare_engine(engine); > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > index 9e4e0ad62724..122a32e0a69d 100644 > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > @@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg) > return err; > } > > +static int wait_for_others(struct drm_i915_private *i915, > + struct intel_engine_cs *exclude) > +{ > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + > + for_each_engine(engine, i915, id) { > + if (engine == exclude) > + continue; > + > + if (wait_for(intel_engine_is_idle(engine), 10)) > + return -EIO; > + } > + > + return 0; > +} > + > static int igt_reset_queue(void *arg) > { > struct drm_i915_private *i915 = arg; > @@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg) > i915_request_get(rq); > __i915_request_add(rq, true); > > + /* > + * XXX We don't handle resetting the kernel context > + * very well. If we trigger a device reset twice in > + * quick succession while the kernel context is > + * executing, we may end up skipping the breadcrumb. > + * This is really only a problem for the selftest as > + * normally there is a large interlude between resets > + * (hangcheck), or we focus on resetting just one > + * engine and so avoid repeatedly resetting innocents. > + */ > + err = wait_for_others(i915, engine); > + if (err) { > + pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", > + __func__, engine->name); > + i915_request_put(rq); > + i915_request_put(prev); > + > + GEM_TRACE_DUMP(); > + i915_gem_set_wedged(i915); > + goto fini; > + } > + > if (!wait_for_hang(&h, prev)) { > struct drm_printer p = drm_info_printer(i915->drm.dev); > > - pr_err("%s: Failed to start request %x, at %x\n", > - __func__, prev->fence.seqno, hws_seqno(&h, prev)); > - intel_engine_dump(prev->engine, &p, > - "%s\n", prev->engine->name); > + pr_err("%s(%s): Failed to start request %x, at %x\n", > + __func__, engine->name, > + prev->fence.seqno, hws_seqno(&h, prev)); > + intel_engine_dump(engine, &p, > + "%s\n", engine->name); > > i915_request_put(rq); > i915_request_put(prev); > -- > 2.16.3 > _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx