If a job hangs more times than the threshold allows, we consider the
entity/ctx behind it guilty and kick out all of its jobs/entities before
sched_recovery. With this feature the driver won't suffer infinite job
resubmission when a job always causes a GPU hang.

A new module parameter "hang_limit" is introduced as the threshold, letting
the driver control how many times a job is allowed to hang before we tag
its context guilty.

Change-Id: I6c08ba126b985232e9b67530c304f09a5aeee78d
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 15 ++++-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 81 ++++++++++++++++++++++++++-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +
 7 files changed, 103 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6312cc5..f3c3c36 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
 extern int amdgpu_pos_buf_per_se;
 extern int amdgpu_cntl_sb_buf_per_se;
 extern int amdgpu_param_buf_per_se;
+extern int amdgpu_hang_limit;
 
 #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /* 3GB by default */
 #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS		3000
@@ -1148,7 +1149,7 @@ struct amdgpu_job {
 	/* user fence handling */
 	uint64_t		uf_addr;
 	uint64_t		uf_sequence;
-
+	atomic_t		karma;
 };
 #define to_amdgpu_job(sched_job)		\
 		container_of((sched_job), struct amdgpu_job, base)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 3947f63..0083153 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -64,7 +64,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, struct amdgpu_ctx *ctx)
 		if (r)
 			goto failed;
 
-		ctx->rings[i].entity.ptr_guilty = &ctx->guilty; /* kernel entity doesn't have ptr_guilty */
+		ctx->rings[i].entity.ptr_guilty = &ctx->guilty; /* a kernel context/entity doesn't have ptr_guilty assigned */
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5573792..0c51fb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2619,6 +2619,7 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
 		if (!ring || !ring->sched.thread)
 			continue;
 
+		amd_sched_job_kickout_guilty(&ring->sched);
 		amd_sched_job_recovery(&ring->sched);
 		kthread_unpark(ring->sched.thread);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 416908a..b999990 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -112,6 +112,7 @@ int amdgpu_prim_buf_per_se = 0;
 int amdgpu_pos_buf_per_se = 0;
 int amdgpu_cntl_sb_buf_per_se = 0;
 int amdgpu_param_buf_per_se = 0;
+int amdgpu_hang_limit = 0;
 
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
 module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -237,6 +238,8 @@ module_param_named(cntl_sb_buf_per_se, amdgpu_cntl_sb_buf_per_se, int, 0444);
 MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater Cache per Shader Engine (default depending on gfx)");
 module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
 
+MODULE_PARM_DESC(hang_limit, "how many times a job is allowed to hang before its context is marked guilty (default 0)");
+module_param_named(hang_limit, amdgpu_hang_limit, int, 0444);
 
 static const struct pci_device_id pciidlist[] = {
 #ifdef CONFIG_DRM_AMDGPU_SI
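For illustration only (this is standard module_param behaviour, nothing the
patch itself adds): with the 0444 permission the threshold is fixed at load
time and visible read-only under /sys/module/amdgpu/parameters/hang_limit,
e.g.

    # tolerate three hangs of the same job before its context is marked guilty
    modprobe amdgpu hang_limit=3

or amdgpu.hang_limit=3 on the kernel command line.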
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 208da11..0209c96 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -186,9 +186,22 @@ static struct fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 	return fence;
 }
 
+static void amdgpu_invalidate_job(struct amd_sched_job *sched_job)
+{
+	struct amdgpu_job *job;
+
+	if (!sched_job || !sched_job->s_entity->ptr_guilty)
+		return;
+
+	job = to_amdgpu_job(sched_job);
+	if (atomic_inc_return(&job->karma) > amdgpu_hang_limit)
+		*sched_job->s_entity->ptr_guilty = true;
+}
+
 const struct amd_sched_backend_ops amdgpu_sched_ops = {
 	.dependency = amdgpu_job_dependency,
 	.run_job = amdgpu_job_run,
 	.timedout_job = amdgpu_job_timedout,
-	.free_job = amdgpu_job_free_cb
+	.free_job = amdgpu_job_free_cb,
+	.invalidate_job = amdgpu_invalidate_job,
 };
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 9100ca8..f671b1a 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -373,11 +373,87 @@ static void amd_sched_job_timedout(struct work_struct *work)
 	job->sched->ops->timedout_job(job);
 }
 
+static inline bool amd_sched_check_guilty(struct amd_sched_entity *entity)
+{
+	if (entity && entity->ptr_guilty != NULL)
+		return *entity->ptr_guilty;
+
+	/* ptr_guilty == NULL means this is a job from a kernel entity */
+	return false;
+}
+
+void amd_sched_job_kickout_guilty(struct amd_gpu_scheduler *sched)
+{
+	struct amd_sched_job *s_job, *s_tmp;
+	struct amd_sched_rq *rq;
+	struct list_head guilty_head;
+	int i;
+
+	INIT_LIST_HEAD(&guilty_head);
+	spin_lock(&sched->job_list_lock);
+	list_for_each_entry_safe(s_job, s_tmp, &sched->ring_mirror_list, node)
+		if (amd_sched_check_guilty(s_job->s_entity))
+			list_move(&s_job->node, &guilty_head);
+	spin_unlock(&sched->job_list_lock);
+
+	/* since free_job may wait/schedule, we'd better run it without the spinlock
+	 * TODO: maybe we can just remove all spinlock protection in this routine because
+	 * it is invoked prior to job_recovery and kthread_unpark
+	 */
+	list_for_each_entry_safe(s_job, s_tmp, &guilty_head, node) {
+		/* the guilty job is fake-signaled to release the cs_wait on it
+		 *
+		 * TODO: we need to add more flags appended to FENCE_SIGNAL and
+		 * change the behavior of fence_wait to indicate that this fence's
+		 * signal is fake and due to gpu-reset, so the UMD learns that the
+		 * CS_SUBMIT failed and its context is invalid.
+		 */
+		amd_sched_fence_finished(s_job->s_fence);
+		fence_put(&s_job->s_fence->finished);
+	}
+
+	/* Go through all entities and signal all jobs from the guilty ones */
+	for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++) {
+		struct amd_sched_entity *entity, *e_tmp;
+
+		if (i == AMD_SCHED_PRIORITY_KERNEL)
+			continue; /* the kernel entity is never guilty and can't be kicked out */
+
+		rq = &sched->sched_rq[i];
+		spin_lock(&rq->lock);
+		list_for_each_entry_safe(entity, e_tmp, &rq->entities, list) {
+			struct amd_sched_job *guilty_job;
+
+			if (amd_sched_check_guilty(entity)) {
+				spin_lock(&entity->queue_lock);
+				while (!kfifo_is_empty(&entity->job_queue)) {
+					kfifo_out(&entity->job_queue, &guilty_job,
+						  sizeof(guilty_job));
+					spin_unlock(&entity->queue_lock);
+					amd_sched_fence_finished(guilty_job->s_fence);
+					fence_put(&guilty_job->s_fence->finished);
+					spin_lock(&entity->queue_lock);
+				}
+				spin_unlock(&entity->queue_lock);
+
+				list_del_init(&entity->list);
+				if (rq->current_entity == entity)
+					rq->current_entity = NULL;
+			}
+		}
+		spin_unlock(&rq->lock);
+	}
+}
+
 void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
 {
-	struct amd_sched_job *s_job;
+	struct amd_sched_job *s_job, *first;
 
 	spin_lock(&sched->job_list_lock);
+	/* the first job in the mirror list is considered the guilty one */
+	first = list_first_entry_or_null(&sched->ring_mirror_list,
+					 struct amd_sched_job, node);
+
 	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
 		if (s_job->s_fence->parent &&
 		    fence_remove_callback(s_job->s_fence->parent,
@@ -388,6 +464,9 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
 	}
 	atomic_set(&sched->hw_rq_count, 0);
 	spin_unlock(&sched->job_list_lock);
+
+	/* this marks every entity behind this job's context as guilty */
+	sched->ops->invalidate_job(first);
 }
 
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
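To make the intended ordering concrete, here is a simplified per-ring sketch
of the recovery path after this patch (based on the amdgpu_sriov_gpu_reset()
hunk above; the comments are mine and error handling is omitted):

	/* before the ASIC reset: detach the HW fences; the head job of the
	 * mirror list gets its karma bumped via ops->invalidate_job(), and
	 * once karma exceeds hang_limit the whole context is marked guilty */
	amd_sched_hw_job_reset(&ring->sched);

	/* ... hardware reset ... */

	/* fake-signal and drop every job/entity of a guilty context, then
	 * resubmit only the innocent jobs and restart the scheduler */
	amd_sched_job_kickout_guilty(&ring->sched);
	amd_sched_job_recovery(&ring->sched);
	kthread_unpark(ring->sched.thread);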
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index ccbbcb0..ab644a6 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -106,6 +106,7 @@ struct amd_sched_backend_ops {
 	struct fence *(*run_job)(struct amd_sched_job *sched_job);
 	void (*timedout_job)(struct amd_sched_job *sched_job);
 	void (*free_job)(struct amd_sched_job *sched_job);
+	void (*invalidate_job)(struct amd_sched_job *sched_job);
 };
 
 enum amd_sched_priority {
@@ -159,4 +160,5 @@ int amd_sched_job_init(struct amd_sched_job *job,
 		       void *owner);
 void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
+void amd_sched_job_kickout_guilty(struct amd_gpu_scheduler *sched);
 #endif
-- 
2.7.4
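As an appendix (not part of the patch): a minimal stand-alone C sketch of the
ptr_guilty indirection the series relies on. The struct and variable names
below are made up for illustration; in the driver the shared flag is
ctx->guilty and each ring's entity points at it via entity.ptr_guilty.

	#include <stdbool.h>
	#include <stdio.h>

	struct toy_ctx    { bool guilty; };
	struct toy_entity { bool *ptr_guilty; };

	int main(void)
	{
		struct toy_ctx c = { .guilty = false };
		/* all entities of one context share the same flag */
		struct toy_entity gfx  = { .ptr_guilty = &c.guilty };
		struct toy_entity sdma = { .ptr_guilty = &c.guilty };

		*gfx.ptr_guilty = true; /* karma exceeded hang_limit */

		/* prints 1: condemning one entity condemns the whole context */
		printf("sdma guilty: %d\n", *sdma.ptr_guilty);
		return 0;
	}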