Re: [RESEND 2/6] drm/i915/selftests: Exercise reset to break stuck GTT eviction

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Mon, 16 Jul 2018 10:32:25 +0100

On 16/07/2018 09:03, Chris Wilson wrote:
We must be able to reset the GPU while we are waiting on it to perform
an eviction (unbinding an active vma). So attach a spinning request to a
target vma and try and it evict it from a thread to see if that blocks
indefinitely.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
  .../gpu/drm/i915/selftests/intel_hangcheck.c  | 153 +++++++++++++++++-
  1 file changed, 151 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 73462a65a330..344973664206 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -27,6 +27,7 @@
  #include "../i915_selftest.h"
  #include "i915_random.h"
  #include "igt_flush_test.h"
+#include "igt_wedge_me.h"
  
  #include "mock_context.h"
  #include "mock_drm.h"
@@ -921,7 +922,7 @@ static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
  	return reset_count;
  }
  
-static int igt_wait_reset(void *arg)
+static int igt_reset_wait(void *arg)
  {
  	struct drm_i915_private *i915 = arg;
  	struct i915_request *rq;
@@ -995,6 +996,152 @@ static int igt_wait_reset(void *arg)
  	return err;
  }
  
+static int evict_vma(void *data)
+{
+	struct i915_vma *vma = data;
+	struct drm_i915_private *i915 = vma->vm->i915;
+	struct drm_mm_node evict = vma->node;
+	int err;
+
+	mutex_lock(&i915->drm.struct_mutex);
+	err = i915_gem_evict_for_node(vma->vm, &evict, 0);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	return err;
+}
+
+static int __igt_reset_evict_vma(struct drm_i915_private *i915,
+				 struct i915_address_space *vm)
+{
+	struct drm_i915_gem_object *obj;
+	struct task_struct *tsk = NULL;
+	struct i915_request *rq;
+	struct i915_vma *vma;
+	struct hang h;
+	int err;
+
+	if (!intel_engine_can_store_dword(i915->engine[RCS]))
+		return 0;
+
+	/* Check that we can recover an unbind stuck on a hanging request */
+
+	global_reset_lock(i915);
+
+	mutex_lock(&i915->drm.struct_mutex);
+	err = hang_init(&h, i915);
+	if (err)
+		goto unlock;
+
+	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
+	if (IS_ERR(obj)) {
+		err = PTR_ERR(obj);
+		goto fini;
+	}
+
+	vma = i915_vma_instance(obj, vm, NULL);
+	if (IS_ERR(vma)) {
+		err = PTR_ERR(vma);
+		goto out_obj;
+	}
+
+	rq = hang_create_request(&h, i915->engine[RCS]);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_obj;
+	}
+
+	err = i915_vma_pin(vma, 0, 0,
+			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
+	if (err)
+		goto out_obj;
+
+	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
+	i915_vma_unpin(vma);
+
+	i915_request_get(rq);
+	i915_request_add(rq);
+	if (err)
+		goto out_rq;
+
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	if (!wait_until_running(&h, rq)) {
+		struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+		pr_err("%s: Failed to start request %x, at %x\n",
+		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
+		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
+
+		i915_gem_set_wedged(i915);
+		goto out_reset;
+	}
+
+	tsk = kthread_run(evict_vma, vma, "igt/evict_vma");
+
+	if (wait_for(waitqueue_active(&rq->execute), 10)) {

So this is 10ms for the thread to start and start waiting. I am trying 
to judge if it will be stable enough.. And since it cannot distinguish 
between the failure of thread to start and get to the bit where it is 
supposed to wait, versus actually not waiting.

Maybe set a flag just before i915_gem_evict_for_node and wait on that first?

And there is a comment against waitqueue_active that warns about races 
but I think it is fine in this case.

+		struct drm_printer p = drm_info_printer(i915->drm.dev);
+
+		pr_err("igt/evict_vma kthread did not wait\n");
+		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
+
+		i915_gem_set_wedged(i915);
+		goto out_reset;
+	}
+
+out_reset:
+	fake_hangcheck(rq, intel_engine_flag(rq->engine));
+
+	if (tsk) {
+		struct igt_wedge_me w;
+
+		igt_wedge_on_timeout(&w, i915, HZ / 5)
+			err = kthread_stop(tsk);

This means 200ms grace for reset to terminate the spinner and unblock 
the eviction thread? Is that enough on all platforms?

+	}
+
+	mutex_lock(&i915->drm.struct_mutex);
+out_rq:
+	i915_request_put(rq);
+out_obj:
+	i915_gem_object_put(obj);
+fini:
+	hang_fini(&h);
+unlock:
+	mutex_unlock(&i915->drm.struct_mutex);
+	global_reset_unlock(i915);
+
+	if (i915_terminally_wedged(&i915->gpu_error))
+		return -EIO;
+
+	return err;
+}
+
+static int igt_reset_evict_ggtt(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+
+	return __igt_reset_evict_vma(i915, &i915->ggtt.vm);
+}
+
+static int igt_reset_evict_ppgtt(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct i915_gem_context *ctx;
+	int err;
+
+	mutex_lock(&i915->drm.struct_mutex);
+	ctx = kernel_context(i915);
+	mutex_unlock(&i915->drm.struct_mutex);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	err = 0;
+	if (ctx->ppgtt)

Just skip the test when !USES_FULL_PPGTT?

+		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm);
+
+	kernel_context_close(ctx);
+	return err;
+}
+
  static int wait_for_others(struct drm_i915_private *i915,
  			   struct intel_engine_cs *exclude)
  {
@@ -1240,8 +1387,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
  		SUBTEST(igt_reset_idle_engine),
  		SUBTEST(igt_reset_active_engine),
  		SUBTEST(igt_reset_engines),
-		SUBTEST(igt_wait_reset),
  		SUBTEST(igt_reset_queue),
+		SUBTEST(igt_reset_wait),
+		SUBTEST(igt_reset_evict_ggtt),
+		SUBTEST(igt_reset_evict_ppgtt),
  		SUBTEST(igt_handle_error),
  	};
  	bool saved_hangcheck;


Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx