Re: [PATCH] drm/i915: Look for an active kernel context before switching

Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> · Thu, 24 May 2018 17:40:47 +0300

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:

> We were not very carefully checking to see if an older request on the
> engine was an earlier switch-to-kernel-context before deciding to emit a
> new switch. The end result would be that we could get into a permanent
> loop of trying to emit a new request to perform the switch simply to
> flush the existing switch.
>
> What we need is a means of tracking the completion of each timeline
> versus the kernel context, that is to detect if a more recent request
> has been submitted that would result in a switch away from the kernel
> context. To realise this, we need only to look in our syncmap on the
> kernel context and check that we have synchronized against all active
> rings.
>
> v2: Since all ringbuffer clients currently share the same timeline, we do
> have to use the gem_context to distinguish clients.
>
> As a bonus, include all the tracing used to debug the death inside
> suspend.
>
> v3: Test, test, test. Construct a selftest to exercise and assert the
> expected behaviour that multiple switch-to-contexts do not emit
> redundant requests.
>
> Reported-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
> Fixes: a89d1f921c15 ("drm/i915: Split i915_gem_timeline into individual timelines")
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_gem.c               |   7 +
>  drivers/gpu/drm/i915/i915_gem.h               |   3 +
>  drivers/gpu/drm/i915/i915_gem_context.c       |  86 +++++++++--
>  drivers/gpu/drm/i915/i915_request.c           |   5 +-
>  .../gpu/drm/i915/selftests/i915_gem_context.c | 144 ++++++++++++++++++
>  .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>  6 files changed, 231 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 03874b50ada9..05f44ca35a06 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3703,6 +3703,9 @@ static int wait_for_engines(struct drm_i915_private *i915)
>  
>  int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>  {
> +	GEM_TRACE("flags=%x (%s)\n",
> +		  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
> +
>  	/* If the device is asleep, we have no requests outstanding */
>  	if (!READ_ONCE(i915->gt.awake))
>  		return 0;
> @@ -3719,6 +3722,7 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>  				return err;
>  		}
>  		i915_retire_requests(i915);
> +		GEM_BUG_ON(i915->gt.active_requests);
>  
>  		return wait_for_engines(i915);
>  	} else {
> @@ -4901,6 +4905,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>  
> +	GEM_BUG_ON(i915->gt.active_requests);
>  	for_each_engine(engine, i915, id) {
>  		GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
>  		GEM_BUG_ON(engine->last_retired_context->gem_context != kctx);
> @@ -4932,6 +4937,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
>  	struct drm_device *dev = &dev_priv->drm;
>  	int ret;
>  
> +	GEM_TRACE("\n");
> +
>  	intel_runtime_pm_get(dev_priv);
>  	intel_suspend_gt_powersave(dev_priv);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
> index 5bf24cfc218c..62ee4e385365 100644
> --- a/drivers/gpu/drm/i915/i915_gem.h
> +++ b/drivers/gpu/drm/i915/i915_gem.h
> @@ -63,9 +63,12 @@ struct drm_i915_private;
>  #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)
>  #define GEM_TRACE(...) trace_printk(__VA_ARGS__)
>  #define GEM_TRACE_DUMP() ftrace_dump(DUMP_ALL)
> +#define GEM_TRACE_DUMP_ON(expr) \
> +	do { if (expr) ftrace_dump(DUMP_ALL); } while (0)
>  #else
>  #define GEM_TRACE(...) do { } while (0)
>  #define GEM_TRACE_DUMP() do { } while (0)
> +#define GEM_TRACE_DUMP_ON(expr) BUILD_BUG_ON_INVALID(expr)
>  #endif
>  
>  #define I915_NUM_ENGINES 8
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index b69b18ef8120..45393f6e0208 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -576,30 +576,72 @@ last_request_on_engine(struct i915_timeline *timeline,
>  {
>  	struct i915_request *rq;
>  
> -	if (timeline == &engine->timeline)
> -		return NULL;
> +	GEM_BUG_ON(timeline == &engine->timeline);
>  
>  	rq = i915_gem_active_raw(&timeline->last_request,
>  				 &engine->i915->drm.struct_mutex);
> -	if (rq && rq->engine == engine)
> +	if (rq && rq->engine == engine) {
> +		GEM_TRACE("last request for %s on engine %s: %llx:%d\n",
> +			  timeline->name, engine->name,
> +			  rq->fence.context, rq->fence.seqno);
> +		GEM_BUG_ON(rq->timeline != timeline);
>  		return rq;
> +	}
>  
>  	return NULL;
>  }
>  
> -static bool engine_has_idle_kernel_context(struct intel_engine_cs *engine)
> +static bool engine_has_kernel_context_barrier(struct intel_engine_cs *engine)
>  {
> -	struct list_head * const active_rings = &engine->i915->gt.active_rings;
> +	struct drm_i915_private *i915 = engine->i915;
> +	const struct intel_context * const ce =
> +		to_intel_context(i915->kernel_context, engine);
> +	struct i915_timeline *barrier = ce->ring->timeline;
>  	struct intel_ring *ring;
> +	bool any_active = false;
>  
> -	lockdep_assert_held(&engine->i915->drm.struct_mutex);
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +	list_for_each_entry(ring, &i915->gt.active_rings, active_link) {
> +		struct i915_request *rq;
> +
> +		rq = last_request_on_engine(ring->timeline, engine);
> +		if (!rq)
> +			continue;
>  
> -	list_for_each_entry(ring, active_rings, active_link) {
> -		if (last_request_on_engine(ring->timeline, engine))
> +		any_active = true;
> +
> +		if (rq->gem_context == i915->kernel_context)
> +			continue;
> +
> +		/*
> +		 * Was this request submitted after the previous
> +		 * switch-to-kernel-context?
> +		 */
> +		if (!i915_timeline_sync_is_later(barrier, &rq->fence)) {
> +			GEM_TRACE("%s needs barrier for %llx:%d\n",
> +				  ring->timeline->name,
> +				  rq->fence.context,
> +				  rq->fence.seqno);
>  			return false;
> +		}
> +
> +		GEM_TRACE("%s has barrier after %llx:%d\n",
> +			  ring->timeline->name,
> +			  rq->fence.context,
> +			  rq->fence.seqno);
>  	}
>  
> -	return intel_engine_has_kernel_context(engine);
> +	/*
> +	 * If any other timeline was still active and behind the last barrier,
> +	 * then our last switch-to-kernel-context must still be queued and
> +	 * will run last (leaving the engine in the kernel context when it
> +	 * eventually idles).
> +	 */
> +	if (any_active)
> +		return true;
> +
> +	/* The engine is idle; check that it is idling in the kernel context. */
> +	return engine->last_retired_context == ce;
>  }
>  
>  int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
> @@ -607,7 +649,10 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>  
> +	GEM_TRACE("\n");
> +
>  	lockdep_assert_held(&i915->drm.struct_mutex);
> +	GEM_BUG_ON(!i915->kernel_context);
>  
>  	i915_retire_requests(i915);
>  
> @@ -615,9 +660,12 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
>  		struct intel_ring *ring;
>  		struct i915_request *rq;
>  
> -		if (engine_has_idle_kernel_context(engine))
> +		GEM_BUG_ON(!to_intel_context(i915->kernel_context, engine));
> +		if (engine_has_kernel_context_barrier(engine))
>  			continue;
>  
> +		GEM_TRACE("emit barrier on %s\n", engine->name);
> +
>  		rq = i915_request_alloc(engine, i915->kernel_context);
>  		if (IS_ERR(rq))
>  			return PTR_ERR(rq);
> @@ -627,10 +675,20 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
>  			struct i915_request *prev;
>  
>  			prev = last_request_on_engine(ring->timeline, engine);
> -			if (prev)
> -				i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> -								 &prev->submit,
> -								 I915_FENCE_GFP);
> +			if (!prev)
> +				continue;
> +
> +			if (prev->gem_context == i915->kernel_context)
> +				continue;
> +
> +			GEM_TRACE("add barrier on %s for %llx:%d\n",
> +				  engine->name,
> +				  prev->fence.context,
> +				  prev->fence.seqno);
> +			i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> +							 &prev->submit,
> +							 I915_FENCE_GFP);
> +			i915_timeline_sync_set(rq->timeline, &prev->fence);
>  		}
>  
>  		/*
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index fc499bcbd105..f187250e60c6 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -320,6 +320,7 @@ static void advance_ring(struct i915_request *request)
>  		 * is just about to be. Either works, if we miss the last two
>  		 * noops - they are safe to be replayed on a reset.
>  		 */
> +		GEM_TRACE("marking %s as inactive\n", ring->timeline->name);
>  		tail = READ_ONCE(request->tail);
>  		list_del(&ring->active_link);
>  	} else {
> @@ -1095,8 +1096,10 @@ void __i915_request_add(struct i915_request *request, bool flush_caches)
>  	i915_gem_active_set(&timeline->last_request, request);
>  
>  	list_add_tail(&request->ring_link, &ring->request_list);
> -	if (list_is_first(&request->ring_link, &ring->request_list))
> +	if (list_is_first(&request->ring_link, &ring->request_list)) {
> +		GEM_TRACE("marking %s as active\n", ring->timeline->name);
>  		list_add(&ring->active_link, &request->i915->gt.active_rings);
> +	}
>  	request->emitted_jiffies = jiffies;
>  
>  	/*
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index ddb03f009232..98a33a8fb046 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -26,6 +26,7 @@
>  #include "igt_flush_test.h"
>  
>  #include "mock_drm.h"
> +#include "mock_gem_device.h"
>  #include "huge_gem_object.h"
>  
>  #define DW_PER_PAGE (PAGE_SIZE / sizeof(u32))
> @@ -420,6 +421,130 @@ static int igt_ctx_exec(void *arg)
>  	return err;
>  }
>  
> +static const char *__engine_name(struct drm_i915_private *i915,
> +				 unsigned int engines)
> +{
> +	struct intel_engine_cs *engine;
> +	unsigned int tmp;
> +
> +	if (engines == ALL_ENGINES)
> +		return "all";
> +
> +	for_each_engine_masked(engine, i915, engines, tmp)
> +		return engine->name;
> +
> +	return "none";
> +}
> +
> +static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
> +					  struct i915_gem_context *ctx,
> +					  unsigned int engines)
> +{
> +	struct intel_engine_cs *engine;
> +	unsigned int tmp;
> +	int err;
> +
> +	GEM_TRACE("Testing %s\n", __engine_name(i915, engines));
> +	for_each_engine_masked(engine, i915, engines, tmp) {
> +		struct i915_request *rq;
> +
> +		rq = i915_request_alloc(engine, ctx);
> +		if (IS_ERR(rq))
> +			return PTR_ERR(rq);
> +
> +		i915_request_add(rq);
> +	}
> +
> +	err = i915_gem_switch_to_kernel_context(i915);
> +	if (err)
> +		return err;
> +
> +	for_each_engine_masked(engine, i915, engines, tmp) {
> +		if (!engine_has_kernel_context_barrier(engine)) {
> +			pr_err("kernel context not last on engine %s!\n",
> +			       engine->name);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	if (err)
> +		return err;
> +
> +	GEM_BUG_ON(i915->gt.active_requests);
> +	for_each_engine_masked(engine, i915, engines, tmp) {
> +		if (engine->last_retired_context->gem_context != i915->kernel_context) {
> +			pr_err("engine %s not idling in kernel context!\n",
> +			       engine->name);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	err = i915_gem_switch_to_kernel_context(i915);
> +	if (err)
> +		return err;
> +
> +	if (i915->gt.active_requests) {
> +		pr_err("switch-to-kernel-context emitted %d requests even though it should already be idling in the kernel context\n",
> +		       i915->gt.active_requests);
> +		return -EINVAL;
> +	}
> +
> +	for_each_engine_masked(engine, i915, engines, tmp) {
> +		if (!intel_engine_has_kernel_context(engine)) {
> +			pr_err("kernel context not last on engine %s!\n",
> +			       engine->name);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int igt_switch_to_kernel_context(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_engine_cs *engine;
> +	struct i915_gem_context *ctx;
> +	enum intel_engine_id id;
> +	int err;
> +
> +	/*
> +	 * A core premise of switching to the kernel context is that
> +	 * if an engine is already idling in the kernel context, we
> +	 * do not emit another request and wake it up. The other being
> +	 * that we do indeed end up idling in the kernel context.
> +	 */
> +
> +	mutex_lock(&i915->drm.struct_mutex);

As discussed in irc,

Add comment or rename the ctx here and in __igt_switch_to_kernel_context
so that the reader knows that this mock kernel context is
used as convenient way to inject test requests.

Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>

> +	ctx = kernel_context(i915);
> +	if (IS_ERR(ctx)) {
> +		err = PTR_ERR(ctx);
> +		goto out_unlock;
> +	}
> +
> +	/* First check idling each individual engine */
> +	for_each_engine(engine, i915, id) {
> +		err = __igt_switch_to_kernel_context(i915, ctx, BIT(id));
> +		if (err)
> +			goto out_unlock;
> +	}
> +
> +	/* Now en masse */
> +	err = __igt_switch_to_kernel_context(i915, ctx, ALL_ENGINES);
> +	if (err)
> +		goto out_unlock;
> +
> +out_unlock:
> +	GEM_TRACE_DUMP_ON(err);
> +	if (igt_flush_test(i915, I915_WAIT_LOCKED))
> +		err = -EIO;
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	kernel_context_close(ctx);
> +	return err;
> +}
> +
>  static int fake_aliasing_ppgtt_enable(struct drm_i915_private *i915)
>  {
>  	struct drm_i915_gem_object *obj;
> @@ -447,9 +572,28 @@ static void fake_aliasing_ppgtt_disable(struct drm_i915_private *i915)
>  	i915_gem_fini_aliasing_ppgtt(i915);
>  }
>  
> +int i915_gem_context_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(igt_switch_to_kernel_context),
> +	};
> +	struct drm_i915_private *i915;
> +	int err;
> +
> +	i915 = mock_gem_device();
> +	if (!i915)
> +		return -ENOMEM;
> +
> +	err = i915_subtests(tests, i915);
> +
> +	drm_dev_unref(&i915->drm);
> +	return err;
> +}
> +
>  int i915_gem_context_live_selftests(struct drm_i915_private *dev_priv)
>  {
>  	static const struct i915_subtest tests[] = {
> +		SUBTEST(igt_switch_to_kernel_context),
>  		SUBTEST(igt_ctx_exec),
>  	};
>  	bool fake_alias = false;
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index d16d74178e9d..1b70208eeea7 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -24,3 +24,4 @@ selftest(vma, i915_vma_mock_selftests)
>  selftest(evict, i915_gem_evict_mock_selftests)
>  selftest(gtt, i915_gem_gtt_mock_selftests)
>  selftest(hugepages, i915_gem_huge_page_mock_selftests)
> +selftest(contexts, i915_gem_context_mock_selftests)
> -- 
> 2.17.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx