Re: [PATCH 3/3] drm/i915/selftests: Fix up igt_reset_engine

Michel Thierry <michel.thierry@xxxxxxxxx> · Mon, 18 Dec 2017 13:50:17 -0800

On 17/12/17 05:28, Chris Wilson wrote:
Now that we skip a per-engine reset on an idle engine, we need to update
the selftest to take that into account. In the process, we find that we
were not stressing the per-engine reset very hard, so add those missing
active resets.

v2: Actually test i915_reset_engine() by loading it with requests.

Fixes: f6ba181ada55 ("drm/i915: Skip an engine reset if it recovered before our preparations")
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Michel Thierry <michel.thierry@xxxxxxxxx>
Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>


Reviewed-by: Michel Thierry <michel.thierry@xxxxxxxxx>

And all these subtests passed with and without GuC in SKL.

---
  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 314 ++++++++++++++++++-----
  1 file changed, 250 insertions(+), 64 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index f98546b8a7fa..c8a756e2139f 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -132,6 +132,12 @@ static int emit_recurse_batch(struct hang *h,
  		*batch++ = lower_32_bits(hws_address(hws, rq));
  		*batch++ = upper_32_bits(hws_address(hws, rq));
  		*batch++ = rq->fence.seqno;
+		*batch++ = MI_ARB_CHECK;
+
+		memset(batch, 0, 1024);
+		batch += 1024 / sizeof(*batch);
+
+		*batch++ = MI_ARB_CHECK;
  		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
  		*batch++ = lower_32_bits(vma->node.start);
  		*batch++ = upper_32_bits(vma->node.start);
@@ -140,6 +146,12 @@ static int emit_recurse_batch(struct hang *h,
  		*batch++ = 0;
  		*batch++ = lower_32_bits(hws_address(hws, rq));
  		*batch++ = rq->fence.seqno;
+		*batch++ = MI_ARB_CHECK;
+
+		memset(batch, 0, 1024);
+		batch += 1024 / sizeof(*batch);
+
+		*batch++ = MI_ARB_CHECK;
  		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
  		*batch++ = lower_32_bits(vma->node.start);
  	} else if (INTEL_GEN(i915) >= 4) {
@@ -147,12 +159,24 @@ static int emit_recurse_batch(struct hang *h,
  		*batch++ = 0;
  		*batch++ = lower_32_bits(hws_address(hws, rq));
  		*batch++ = rq->fence.seqno;
+		*batch++ = MI_ARB_CHECK;
+
+		memset(batch, 0, 1024);
+		batch += 1024 / sizeof(*batch);
+
+		*batch++ = MI_ARB_CHECK;
  		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
  		*batch++ = lower_32_bits(vma->node.start);
  	} else {
  		*batch++ = MI_STORE_DWORD_IMM;
  		*batch++ = lower_32_bits(hws_address(hws, rq));
  		*batch++ = rq->fence.seqno;
+		*batch++ = MI_ARB_CHECK;
+
+		memset(batch, 0, 1024);
+		batch += 1024 / sizeof(*batch);
+
+		*batch++ = MI_ARB_CHECK;
  		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
  		*batch++ = lower_32_bits(vma->node.start);
  	}
@@ -234,6 +258,16 @@ static void hang_fini(struct hang *h)
  	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
  }
  
+static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
+{
+	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
+					       rq->fence.seqno),
+			     10) &&
+		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
+					    rq->fence.seqno),
+			  1000));
+}
+
  static int igt_hang_sanitycheck(void *arg)
  {
  	struct drm_i915_private *i915 = arg;
@@ -296,6 +330,9 @@ static void global_reset_lock(struct drm_i915_private *i915)
  	struct intel_engine_cs *engine;
  	enum intel_engine_id id;
  
+	pr_debug("%s: current gpu_error=%08lx\n",
+		 __func__, i915->gpu_error.flags);
+
  	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
  		wait_event(i915->gpu_error.reset_queue,
  			   !test_bit(I915_RESET_BACKOFF,
@@ -353,54 +390,127 @@ static int igt_global_reset(void *arg)
  	return err;
  }
  
-static int igt_reset_engine(void *arg)
+static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
  {
-	struct drm_i915_private *i915 = arg;
  	struct intel_engine_cs *engine;
  	enum intel_engine_id id;
-	unsigned int reset_count, reset_engine_count;
+	struct hang h;
  	int err = 0;
  
-	/* Check that we can issue a global GPU and engine reset */
+	/* Check that we can issue an engine reset on an idle engine (no-op) */
  
  	if (!intel_has_reset_engine(i915))
  		return 0;
  
+	if (active) {
+		mutex_lock(&i915->drm.struct_mutex);
+		err = hang_init(&h, i915);
+		mutex_unlock(&i915->drm.struct_mutex);
+		if (err)
+			return err;
+	}
+
  	for_each_engine(engine, i915, id) {
-		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
+		unsigned int reset_count, reset_engine_count;
+		IGT_TIMEOUT(end_time);
+
+		if (active && !intel_engine_can_store_dword(engine))
+			continue;
+
  		reset_count = i915_reset_count(&i915->gpu_error);
  		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
  							     engine);
  
-		err = i915_reset_engine(engine, I915_RESET_QUIET);
-		if (err) {
-			pr_err("i915_reset_engine failed\n");
-			break;
-		}
+		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+		do {
+			if (active) {
+				struct drm_i915_gem_request *rq;
+
+				mutex_lock(&i915->drm.struct_mutex);
+				rq = hang_create_request(&h, engine,
+							 i915->kernel_context);
+				if (IS_ERR(rq)) {
+					err = PTR_ERR(rq);
+					break;
+				}
+
+				i915_gem_request_get(rq);
+				__i915_add_request(rq, true);
+				mutex_unlock(&i915->drm.struct_mutex);
+
+				if (!wait_for_hang(&h, rq)) {
+					struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+					pr_err("%s: Failed to start request %x, at %x\n",
+					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
+					intel_engine_dump(engine, &p,
+							  "%s\n", engine->name);
+
+					i915_gem_request_put(rq);
+					err = -EIO;
+					break;
+				}
  
-		if (i915_reset_count(&i915->gpu_error) != reset_count) {
-			pr_err("Full GPU reset recorded! (engine reset expected)\n");
-			err = -EINVAL;
-			break;
-		}
+				i915_gem_request_put(rq);
+			}
+
+			engine->hangcheck.stalled = true;
+			engine->hangcheck.seqno =
+				intel_engine_get_seqno(engine);
+
+			err = i915_reset_engine(engine, I915_RESET_QUIET);
+			if (err) {
+				pr_err("i915_reset_engine failed\n");
+				break;
+			}
+
+			if (i915_reset_count(&i915->gpu_error) != reset_count) {
+				pr_err("Full GPU reset recorded! (engine reset expected)\n");
+				err = -EINVAL;
+				break;
+			}
+
+			reset_engine_count += active;
+			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
+			    reset_engine_count) {
+				pr_err("%s engine reset %srecorded!\n",
+				       engine->name, active ? "not " : "");
+				err = -EINVAL;
+				break;
+			}
+
+			engine->hangcheck.stalled = false;
+		} while (time_before(jiffies, end_time));
+		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
  
-		if (i915_reset_engine_count(&i915->gpu_error, engine) ==
-		    reset_engine_count) {
-			pr_err("No %s engine reset recorded!\n", engine->name);
-			err = -EINVAL;
+		if (err)
  			break;
-		}
  
-		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &i915->gpu_error.flags);
+		cond_resched();
  	}
  
  	if (i915_terminally_wedged(&i915->gpu_error))
  		err = -EIO;
  
+	if (active) {
+		mutex_lock(&i915->drm.struct_mutex);
+		hang_fini(&h);
+		mutex_unlock(&i915->drm.struct_mutex);
+	}
+
  	return err;
  }
  
+static int igt_reset_idle_engine(void *arg)
+{
+	return __igt_reset_engine(arg, false);
+}
+
+static int igt_reset_active_engine(void *arg)
+{
+	return __igt_reset_engine(arg, true);
+}
+
  static int active_engine(void *data)
  {
  	struct intel_engine_cs *engine = data;
@@ -462,11 +572,12 @@ static int active_engine(void *data)
  	return err;
  }
  
-static int igt_reset_active_engines(void *arg)
+static int __igt_reset_engine_others(struct drm_i915_private *i915,
+				     bool active)
  {
-	struct drm_i915_private *i915 = arg;
-	struct intel_engine_cs *engine, *active;
+	struct intel_engine_cs *engine, *other;
  	enum intel_engine_id id, tmp;
+	struct hang h;
  	int err = 0;
  
  	/* Check that issuing a reset on one engine does not interfere
@@ -476,24 +587,36 @@ static int igt_reset_active_engines(void *arg)
  	if (!intel_has_reset_engine(i915))
  		return 0;
  
+	if (active) {
+		mutex_lock(&i915->drm.struct_mutex);
+		err = hang_init(&h, i915);
+		mutex_unlock(&i915->drm.struct_mutex);
+		if (err)
+			return err;
+	}
+
  	for_each_engine(engine, i915, id) {
-		struct task_struct *threads[I915_NUM_ENGINES];
+		struct task_struct *threads[I915_NUM_ENGINES] = {};
  		unsigned long resets[I915_NUM_ENGINES];
  		unsigned long global = i915_reset_count(&i915->gpu_error);
+		unsigned long count = 0;
  		IGT_TIMEOUT(end_time);
  
+		if (active && !intel_engine_can_store_dword(engine))
+			continue;
+
  		memset(threads, 0, sizeof(threads));
-		for_each_engine(active, i915, tmp) {
+		for_each_engine(other, i915, tmp) {
  			struct task_struct *tsk;
  
-			if (active == engine)
-				continue;
-
  			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
-							      active);
+							      other);
  
-			tsk = kthread_run(active_engine, active,
-					  "igt/%s", active->name);
+			if (other == engine)
+				continue;
+
+			tsk = kthread_run(active_engine, other,
+					  "igt/%s", other->name);
  			if (IS_ERR(tsk)) {
  				err = PTR_ERR(tsk);
  				goto unwind;
@@ -503,20 +626,70 @@ static int igt_reset_active_engines(void *arg)
  			get_task_struct(tsk);
  		}
  
-		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
+		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
  		do {
+			if (active) {
+				struct drm_i915_gem_request *rq;
+
+				mutex_lock(&i915->drm.struct_mutex);
+				rq = hang_create_request(&h, engine,
+							 i915->kernel_context);
+				if (IS_ERR(rq)) {
+					err = PTR_ERR(rq);
+					mutex_unlock(&i915->drm.struct_mutex);
+					break;
+				}
+
+				i915_gem_request_get(rq);
+				__i915_add_request(rq, true);
+				mutex_unlock(&i915->drm.struct_mutex);
+
+				if (!wait_for_hang(&h, rq)) {
+					struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+					pr_err("%s: Failed to start request %x, at %x\n",
+					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
+					intel_engine_dump(engine, &p,
+							  "%s\n", engine->name);
+
+					i915_gem_request_put(rq);
+					err = -EIO;
+					break;
+				}
+
+				i915_gem_request_put(rq);
+			}
+
+			engine->hangcheck.stalled = true;
+			engine->hangcheck.seqno =
+				intel_engine_get_seqno(engine);
+
  			err = i915_reset_engine(engine, I915_RESET_QUIET);
  			if (err) {
-				pr_err("i915_reset_engine(%s) failed, err=%d\n",
-				       engine->name, err);
+				pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
+				       engine->name, active ? "active" : "idle", err);
  				break;
  			}
+
+			engine->hangcheck.stalled = false;
+			count++;
  		} while (time_before(jiffies, end_time));
-		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &i915->gpu_error.flags);
+		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
+		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
+			engine->name, active ? "active" : "idle", count);
+
+		if (i915_reset_engine_count(&i915->gpu_error, engine) -
+		    resets[engine->id] != (active ? count : 0)) {
+			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
+			       engine->name, active ? "active" : "idle", count,
+			       i915_reset_engine_count(&i915->gpu_error,
+						       engine) - resets[engine->id]);
+			if (!err)
+				err = -EINVAL;
+		}
  
  unwind:
-		for_each_engine(active, i915, tmp) {
+		for_each_engine(other, i915, tmp) {
  			int ret;
  
  			if (!threads[tmp])
@@ -524,27 +697,29 @@ static int igt_reset_active_engines(void *arg)
  
  			ret = kthread_stop(threads[tmp]);
  			if (ret) {
-				pr_err("kthread for active engine %s failed, err=%d\n",
-				       active->name, ret);
+				pr_err("kthread for other engine %s failed, err=%d\n",
+				       other->name, ret);
  				if (!err)
  					err = ret;
  			}
  			put_task_struct(threads[tmp]);
  
  			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
-								   active)) {
+								   other)) {
  				pr_err("Innocent engine %s was reset (count=%ld)\n",
-				       active->name,
+				       other->name,
  				       i915_reset_engine_count(&i915->gpu_error,
-							       active) - resets[tmp]);
-				err = -EIO;
+							       other) - resets[tmp]);
+				if (!err)
+					err = -EINVAL;
  			}
  		}
  
  		if (global != i915_reset_count(&i915->gpu_error)) {
  			pr_err("Global reset (count=%ld)!\n",
  			       i915_reset_count(&i915->gpu_error) - global);
-			err = -EIO;
+			if (!err)
+				err = -EINVAL;
  		}
  
  		if (err)
@@ -556,9 +731,25 @@ static int igt_reset_active_engines(void *arg)
  	if (i915_terminally_wedged(&i915->gpu_error))
  		err = -EIO;
  
+	if (active) {
+		mutex_lock(&i915->drm.struct_mutex);
+		hang_fini(&h);
+		mutex_unlock(&i915->drm.struct_mutex);
+	}
+
  	return err;
  }
  
+static int igt_reset_idle_engine_others(void *arg)
+{
+	return __igt_reset_engine_others(arg, false);
+}
+
+static int igt_reset_active_engine_others(void *arg)
+{
+	return __igt_reset_engine_others(arg, true);
+}
+
  static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
  {
  	u32 reset_count;
@@ -574,16 +765,6 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
  	return reset_count;
  }
  
-static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
-{
-	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
-					       rq->fence.seqno),
-			     10) &&
-		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
-					    rq->fence.seqno),
-			  1000));
-}
-
  static int igt_wait_reset(void *arg)
  {
  	struct drm_i915_private *i915 = arg;
@@ -617,8 +798,8 @@ static int igt_wait_reset(void *arg)
  	if (!wait_for_hang(&h, rq)) {
  		struct drm_printer p = drm_info_printer(i915->drm.dev);
  
-		pr_err("Failed to start request %x, at %x\n",
-		       rq->fence.seqno, hws_seqno(&h, rq));
+		pr_err("%s: Failed to start request %x, at %x\n",
+		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
  		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
  
  		i915_reset(i915, 0);
@@ -712,8 +893,8 @@ static int igt_reset_queue(void *arg)
  			if (!wait_for_hang(&h, prev)) {
  				struct drm_printer p = drm_info_printer(i915->drm.dev);
  
-				pr_err("Failed to start request %x, at %x\n",
-				       prev->fence.seqno, hws_seqno(&h, prev));
+				pr_err("%s: Failed to start request %x, at %x\n",
+				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
  				intel_engine_dump(prev->engine, &p,
  						  "%s\n", prev->engine->name);
  
@@ -819,8 +1000,8 @@ static int igt_handle_error(void *arg)
  	if (!wait_for_hang(&h, rq)) {
  		struct drm_printer p = drm_info_printer(i915->drm.dev);
  
-		pr_err("Failed to start request %x, at %x\n",
-		       rq->fence.seqno, hws_seqno(&h, rq));
+		pr_err("%s: Failed to start request %x, at %x\n",
+		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
  		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
  
  		i915_reset(i915, 0);
@@ -864,21 +1045,26 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
  	static const struct i915_subtest tests[] = {
  		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
  		SUBTEST(igt_hang_sanitycheck),
-		SUBTEST(igt_reset_engine),
-		SUBTEST(igt_reset_active_engines),
+		SUBTEST(igt_reset_idle_engine),
+		SUBTEST(igt_reset_active_engine),
+		SUBTEST(igt_reset_idle_engine_others),
+		SUBTEST(igt_reset_active_engine_others),
  		SUBTEST(igt_wait_reset),
  		SUBTEST(igt_reset_queue),
  		SUBTEST(igt_handle_error),
  	};
+	bool saved_hangcheck;
  	int err;
  
  	if (!intel_has_gpu_reset(i915))
  		return 0;
  
  	intel_runtime_pm_get(i915);
+	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
  
  	err = i915_subtests(tests, i915);
  
+	i915_modparams.enable_hangcheck = saved_hangcheck;
  	intel_runtime_pm_put(i915);
  
  	return err;

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx