On Thu, May 24, 2018 at 03:51:23PM +0100, Chris Wilson wrote: > Quoting Mika Kuoppala (2018-05-24 15:40:47) > > Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > > > > > We were not very carefully checking to see if an older request on the > > > engine was an earlier switch-to-kernel-context before deciding to emit a > > > new switch. The end result would be that we could get into a permanent > > > loop of trying to emit a new request to perform the switch simply to > > > flush the existing switch. > > > > > > What we need is a means of tracking the completion of each timeline > > > versus the kernel context, that is to detect if a more recent request > > > has been submitted that would result in a switch away from the kernel > > > context. To realise this, we need only to look in our syncmap on the > > > kernel context and check that we have synchronized against all active > > > rings. > > > > > > v2: Since all ringbuffer clients currently share the same timeline, we do > > > have to use the gem_context to distinguish clients. > > > > > > As a bonus, include all the tracing used to debug the death inside > > > suspend. > > > > > > v3: Test, test, test. Construct a selftest to exercise and assert the > > > expected behaviour that multiple switch-to-contexts do not emit > > > redundant requests. 
> > > > > > Reported-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > > > Fixes: a89d1f921c15 ("drm/i915: Split i915_gem_timeline into individual timelines") > > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > > > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > > > --- > > > drivers/gpu/drm/i915/i915_gem.c | 7 + > > > drivers/gpu/drm/i915/i915_gem.h | 3 + > > > drivers/gpu/drm/i915/i915_gem_context.c | 86 +++++++++-- > > > drivers/gpu/drm/i915/i915_request.c | 5 +- > > > .../gpu/drm/i915/selftests/i915_gem_context.c | 144 ++++++++++++++++++ > > > .../drm/i915/selftests/i915_mock_selftests.h | 1 + > > > 6 files changed, 231 insertions(+), 15 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > > > index 03874b50ada9..05f44ca35a06 100644 > > > --- a/drivers/gpu/drm/i915/i915_gem.c > > > +++ b/drivers/gpu/drm/i915/i915_gem.c > > > @@ -3703,6 +3703,9 @@ static int wait_for_engines(struct drm_i915_private *i915) > > > > > > int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags) > > > { > > > + GEM_TRACE("flags=%x (%s)\n", > > > + flags, flags & I915_WAIT_LOCKED ? 
"locked" : "unlocked"); > > > + > > > /* If the device is asleep, we have no requests outstanding */ > > > if (!READ_ONCE(i915->gt.awake)) > > > return 0; > > > @@ -3719,6 +3722,7 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags) > > > return err; > > > } > > > i915_retire_requests(i915); > > > + GEM_BUG_ON(i915->gt.active_requests); > > > > > > return wait_for_engines(i915); > > > } else { > > > @@ -4901,6 +4905,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915) > > > struct intel_engine_cs *engine; > > > enum intel_engine_id id; > > > > > > + GEM_BUG_ON(i915->gt.active_requests); > > > for_each_engine(engine, i915, id) { > > > GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request)); > > > GEM_BUG_ON(engine->last_retired_context->gem_context != kctx); > > > @@ -4932,6 +4937,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv) > > > struct drm_device *dev = &dev_priv->drm; > > > int ret; > > > > > > + GEM_TRACE("\n"); > > > + > > > intel_runtime_pm_get(dev_priv); > > > intel_suspend_gt_powersave(dev_priv); > > > > > > diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h > > > index 5bf24cfc218c..62ee4e385365 100644 > > > --- a/drivers/gpu/drm/i915/i915_gem.h > > > +++ b/drivers/gpu/drm/i915/i915_gem.h > > > @@ -63,9 +63,12 @@ struct drm_i915_private; > > > #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM) > > > #define GEM_TRACE(...) trace_printk(__VA_ARGS__) > > > #define GEM_TRACE_DUMP() ftrace_dump(DUMP_ALL) > > > +#define GEM_TRACE_DUMP_ON(expr) \ > > > + do { if (expr) ftrace_dump(DUMP_ALL); } while (0) > > > #else > > > #define GEM_TRACE(...) 
do { } while (0) > > > #define GEM_TRACE_DUMP() do { } while (0) > > > +#define GEM_TRACE_DUMP_ON(expr) BUILD_BUG_ON_INVALID(expr) > > > #endif > > > > > > #define I915_NUM_ENGINES 8 > > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > > > index b69b18ef8120..45393f6e0208 100644 > > > --- a/drivers/gpu/drm/i915/i915_gem_context.c > > > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > > > @@ -576,30 +576,72 @@ last_request_on_engine(struct i915_timeline *timeline, > > > { > > > struct i915_request *rq; > > > > > > - if (timeline == &engine->timeline) > > > - return NULL; > > > + GEM_BUG_ON(timeline == &engine->timeline); > > > > > > rq = i915_gem_active_raw(&timeline->last_request, > > > &engine->i915->drm.struct_mutex); > > > - if (rq && rq->engine == engine) > > > + if (rq && rq->engine == engine) { > > > + GEM_TRACE("last request for %s on engine %s: %llx:%d\n", > > > + timeline->name, engine->name, > > > + rq->fence.context, rq->fence.seqno); > > > + GEM_BUG_ON(rq->timeline != timeline); > > > return rq; > > > + } > > > > > > return NULL; > > > } > > > > > > -static bool engine_has_idle_kernel_context(struct intel_engine_cs *engine) > > > +static bool engine_has_kernel_context_barrier(struct intel_engine_cs *engine) > > > { > > > - struct list_head * const active_rings = &engine->i915->gt.active_rings; > > > + struct drm_i915_private *i915 = engine->i915; > > > + const struct intel_context * const ce = > > > + to_intel_context(i915->kernel_context, engine); > > > + struct i915_timeline *barrier = ce->ring->timeline; > > > struct intel_ring *ring; > > > + bool any_active = false; > > > > > > - lockdep_assert_held(&engine->i915->drm.struct_mutex); > > > + lockdep_assert_held(&i915->drm.struct_mutex); > > > + list_for_each_entry(ring, &i915->gt.active_rings, active_link) { > > > + struct i915_request *rq; > > > + > > > + rq = last_request_on_engine(ring->timeline, engine); > > > + if (!rq) > > > + continue; > > > > > 
> - list_for_each_entry(ring, active_rings, active_link) { > > > - if (last_request_on_engine(ring->timeline, engine)) > > > + any_active = true; > > > + > > > + if (rq->gem_context == i915->kernel_context) > > > + continue; > > > + > > > + /* > > > + * Was this request submitted after the previous > > > + * switch-to-kernel-context? > > > + */ > > > + if (!i915_timeline_sync_is_later(barrier, &rq->fence)) { > > > + GEM_TRACE("%s needs barrier for %llx:%d\n", > > > + ring->timeline->name, > > > + rq->fence.context, > > > + rq->fence.seqno); > > > return false; > > > + } > > > + > > > + GEM_TRACE("%s has barrier after %llx:%d\n", > > > + ring->timeline->name, > > > + rq->fence.context, > > > + rq->fence.seqno); > > > } > > > > > > - return intel_engine_has_kernel_context(engine); > > > + /* > > > + * If any other timeline was still active and behind the last barrier, > > > + * then our last switch-to-kernel-context must still be queued and > > > + * will run last (leaving the engine in the kernel context when it > > > + * eventually idles). > > > + */ > > > + if (any_active) > > > + return true; > > > + > > > + /* The engine is idle; check that it is idling in the kernel context. 
*/ > > > + return engine->last_retired_context == ce; > > > } > > > > > > int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915) > > > @@ -607,7 +649,10 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915) > > > struct intel_engine_cs *engine; > > > enum intel_engine_id id; > > > > > > + GEM_TRACE("\n"); > > > + > > > lockdep_assert_held(&i915->drm.struct_mutex); > > > + GEM_BUG_ON(!i915->kernel_context); > > > > > > i915_retire_requests(i915); > > > > > > @@ -615,9 +660,12 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915) > > > struct intel_ring *ring; > > > struct i915_request *rq; > > > > > > - if (engine_has_idle_kernel_context(engine)) > > > + GEM_BUG_ON(!to_intel_context(i915->kernel_context, engine)); > > > + if (engine_has_kernel_context_barrier(engine)) > > > continue; > > > > > > + GEM_TRACE("emit barrier on %s\n", engine->name); > > > + > > > rq = i915_request_alloc(engine, i915->kernel_context); > > > if (IS_ERR(rq)) > > > return PTR_ERR(rq); > > > @@ -627,10 +675,20 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915) > > > struct i915_request *prev; > > > > > > prev = last_request_on_engine(ring->timeline, engine); > > > - if (prev) > > > - i915_sw_fence_await_sw_fence_gfp(&rq->submit, > > > - &prev->submit, > > > - I915_FENCE_GFP); > > > + if (!prev) > > > + continue; > > > + > > > + if (prev->gem_context == i915->kernel_context) > > > + continue; > > > + > > > + GEM_TRACE("add barrier on %s for %llx:%d\n", > > > + engine->name, > > > + prev->fence.context, > > > + prev->fence.seqno); > > > + i915_sw_fence_await_sw_fence_gfp(&rq->submit, > > > + &prev->submit, > > > + I915_FENCE_GFP); > > > + i915_timeline_sync_set(rq->timeline, &prev->fence); > > > } > > > > > > /* > > > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > > > index fc499bcbd105..f187250e60c6 100644 > > > --- a/drivers/gpu/drm/i915/i915_request.c > > > +++ 
b/drivers/gpu/drm/i915/i915_request.c > > > @@ -320,6 +320,7 @@ static void advance_ring(struct i915_request *request) > > > * is just about to be. Either works, if we miss the last two > > > * noops - they are safe to be replayed on a reset. > > > */ > > > + GEM_TRACE("marking %s as inactive\n", ring->timeline->name); > > > tail = READ_ONCE(request->tail); > > > list_del(&ring->active_link); > > > } else { > > > @@ -1095,8 +1096,10 @@ void __i915_request_add(struct i915_request *request, bool flush_caches) > > > i915_gem_active_set(&timeline->last_request, request); > > > > > > list_add_tail(&request->ring_link, &ring->request_list); > > > - if (list_is_first(&request->ring_link, &ring->request_list)) > > > + if (list_is_first(&request->ring_link, &ring->request_list)) { > > > + GEM_TRACE("marking %s as active\n", ring->timeline->name); > > > list_add(&ring->active_link, &request->i915->gt.active_rings); > > > + } > > > request->emitted_jiffies = jiffies; > > > > > > /* > > > diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c > > > index ddb03f009232..98a33a8fb046 100644 > > > --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c > > > +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c > > > @@ -26,6 +26,7 @@ > > > #include "igt_flush_test.h" > > > > > > #include "mock_drm.h" > > > +#include "mock_gem_device.h" > > > #include "huge_gem_object.h" > > > > > > #define DW_PER_PAGE (PAGE_SIZE / sizeof(u32)) > > > @@ -420,6 +421,130 @@ static int igt_ctx_exec(void *arg) > > > return err; > > > } > > > > > > +static const char *__engine_name(struct drm_i915_private *i915, > > > + unsigned int engines) > > > +{ > > > + struct intel_engine_cs *engine; > > > + unsigned int tmp; > > > + > > > + if (engines == ALL_ENGINES) > > > + return "all"; > > > + > > > + for_each_engine_masked(engine, i915, engines, tmp) > > > + return engine->name; > > > + > > > + return "none"; > > > +} > > > + > > > +static int 
__igt_switch_to_kernel_context(struct drm_i915_private *i915, > > > + struct i915_gem_context *ctx, > > > + unsigned int engines) > > > +{ > > > + struct intel_engine_cs *engine; > > > + unsigned int tmp; > > > + int err; > > > + > > > + GEM_TRACE("Testing %s\n", __engine_name(i915, engines)); > > > + for_each_engine_masked(engine, i915, engines, tmp) { > > > + struct i915_request *rq; > > > + > > > + rq = i915_request_alloc(engine, ctx); > > > + if (IS_ERR(rq)) > > > + return PTR_ERR(rq); > > > + > > > + i915_request_add(rq); > > > + } > > > + > > > + err = i915_gem_switch_to_kernel_context(i915); > > > + if (err) > > > + return err; > > > + > > > + for_each_engine_masked(engine, i915, engines, tmp) { > > > + if (!engine_has_kernel_context_barrier(engine)) { > > > + pr_err("kernel context not last on engine %s!\n", > > > + engine->name); > > > + return -EINVAL; > > > + } > > > + } > > > + > > > + err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); > > > + if (err) > > > + return err; > > > + > > > + GEM_BUG_ON(i915->gt.active_requests); > > > + for_each_engine_masked(engine, i915, engines, tmp) { > > > + if (engine->last_retired_context->gem_context != i915->kernel_context) { > > > + pr_err("engine %s not idling in kernel context!\n", > > > + engine->name); > > > + return -EINVAL; > > > + } > > > + } > > > + > > > + err = i915_gem_switch_to_kernel_context(i915); > > > + if (err) > > > + return err; > > > + > > > + if (i915->gt.active_requests) { > > > + pr_err("switch-to-kernel-context emitted %d requests even though it should already be idling in the kernel context\n", > > > + i915->gt.active_requests); > > > + return -EINVAL; > > > + } > > > + > > > + for_each_engine_masked(engine, i915, engines, tmp) { > > > + if (!intel_engine_has_kernel_context(engine)) { > > > + pr_err("kernel context not last on engine %s!\n", > > > + engine->name); > > > + return -EINVAL; > > > + } > > > + } > > > + > > > + return 0; > > > +} > > > + > > > +static int 
igt_switch_to_kernel_context(void *arg) > > > +{ > > > + struct drm_i915_private *i915 = arg; > > > + struct intel_engine_cs *engine; > > > + struct i915_gem_context *ctx; > > > + enum intel_engine_id id; > > > + int err; > > > + > > > + /* > > > + * A core premise of switching to the kernel context is that > > > + * if an engine is already idling in the kernel context, we > > > + * do not emit another request and wake it up. The other being > > > + * that we do indeed end up idling in the kernel context. > > > + */ > > > + > > > + mutex_lock(&i915->drm.struct_mutex); > > > > As discussed in irc, > > > > Add comment or rename the ctx here and in __igt_switch_to_kernel_context > > so that the reader knows that this mock kernel context is > > used as convenient way to inject test requests. > > > > Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > > > > > > > + ctx = kernel_context(i915); > > I can see how that might look confusing. Will try to address the issue > with the factory names. > > Thank you for your patience in finding the bug, suggesting a fix and in > reviewing the aftermath. Pushed, for tomorrow we need to fix suspend! Hi Chris, this failed the cherry-pick for drm-intel-fixes, but looking at the log it seems like something we want there, right? If so, could you please send the ported version for drm-intel-fixes targeting 4.18? Thanks, Rodrigo. > -Chris > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx