The GPU scheduler currently does not ensure that its pending_list is
empty before performing various other teardown tasks in
drm_sched_fini().

If there are still jobs in the pending_list, this is problematic
because after scheduler teardown no one will call
backend_ops.free_job() anymore. This would, consequently, result in
memory leaks.

One way to solve this is to implement reference counting for struct
drm_gpu_scheduler itself. Each job added to the pending_list takes a
reference; each job removed from it drops a reference. This approach
would keep the scheduler running even after users have called
drm_sched_fini(), and it would ultimately stop only after the last job
has been removed from the pending_list.

Add reference counting to struct drm_gpu_scheduler. Move the teardown
logic to __drm_sched_fini() and have drm_sched_fini() simply drop a
reference.

Suggested-by: Danilo Krummrich <dakr@xxxxxxxxxx>
Signed-off-by: Philipp Stanner <pstanner@xxxxxxxxxx>
---
Hi all,

since the scheduler has many stakeholders, I want this solution
discussed as an RFC first.

The advantage of this version is that it does not block
drm_sched_fini(); the price paid for that is that the scheduler keeps
running until all jobs are gone.

Cheers,
P.
---
 drivers/gpu/drm/scheduler/sched_main.c | 17 ++++++++++++++++-
 include/drm/gpu_scheduler.h            |  5 +++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 7e90c9f95611..62b453c8ed76 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -99,6 +99,8 @@ int drm_sched_policy = DRM_SCHED_POLICY_FIFO;
 MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
 module_param_named(sched_policy, drm_sched_policy, int, 0444);
 
+static void __drm_sched_fini(struct kref *jobs_remaining);
+
 static u32 drm_sched_available_credits(struct drm_gpu_scheduler *sched)
 {
 	u32 credits;
@@ -540,6 +542,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
 
 	spin_lock(&sched->job_list_lock);
 	list_add_tail(&s_job->list, &sched->pending_list);
+	kref_get(&sched->jobs_remaining);
 	drm_sched_start_timeout(sched);
 	spin_unlock(&sched->job_list_lock);
 }
@@ -564,6 +567,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
 		 * is parked at which point it's safe.
 		 */
 		list_del_init(&job->list);
+		kref_put(&sched->jobs_remaining, __drm_sched_fini);
 		spin_unlock(&sched->job_list_lock);
 
 		status = job->sched->ops->timedout_job(job);
@@ -637,6 +641,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 			 */
 			spin_lock(&sched->job_list_lock);
 			list_del_init(&s_job->list);
+			kref_put(&sched->jobs_remaining, __drm_sched_fini);
 			spin_unlock(&sched->job_list_lock);
 
 			/*
@@ -1084,6 +1089,7 @@ drm_sched_get_finished_job(struct drm_gpu_scheduler *sched)
 	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
 		/* remove job from pending_list */
 		list_del_init(&job->list);
+		kref_put(&sched->jobs_remaining, __drm_sched_fini);
 
 		/* cancel this job's TO timer */
 		cancel_delayed_work(&sched->work_tdr);
@@ -1303,6 +1309,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 	init_waitqueue_head(&sched->job_scheduled);
 	INIT_LIST_HEAD(&sched->pending_list);
 	spin_lock_init(&sched->job_list_lock);
+	kref_init(&sched->jobs_remaining);
 	atomic_set(&sched->credit_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
 	INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
@@ -1334,11 +1341,14 @@ EXPORT_SYMBOL(drm_sched_init);
  *
  * Tears down and cleans up the scheduler.
  */
-void drm_sched_fini(struct drm_gpu_scheduler *sched)
+static void __drm_sched_fini(struct kref *jobs_remaining)
 {
+	struct drm_gpu_scheduler *sched;
 	struct drm_sched_entity *s_entity;
 	int i;
 
+	sched = container_of(jobs_remaining, struct drm_gpu_scheduler, jobs_remaining);
+
 	drm_sched_wqueue_stop(sched);
 
 	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
@@ -1368,6 +1378,11 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 	kfree(sched->sched_rq);
 	sched->sched_rq = NULL;
 }
+
+void drm_sched_fini(struct drm_gpu_scheduler *sched)
+{
+	kref_put(&sched->jobs_remaining, __drm_sched_fini);
+}
 EXPORT_SYMBOL(drm_sched_fini);
 
 /**
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 5acc64954a88..b7fad8294861 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -29,6 +29,7 @@
 #include <linux/completion.h>
 #include <linux/xarray.h>
 #include <linux/workqueue.h>
+#include <linux/kref.h>
 
 #define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000)
 
@@ -495,6 +496,9 @@ struct drm_sched_backend_ops {
  *              waits on this wait queue until all the scheduled jobs are
  *              finished.
  * @job_id_count: used to assign unique id to the each job.
+ * @jobs_remaining: submitted jobs take a refcount of the scheduler to prevent it
+ *                  from terminating before the last job has been removed from
+ *                  @pending_list.
  * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
  * @timeout_wq: workqueue used to queue @work_tdr
  * @work_run_job: work which calls run_job op of each scheduler.
@@ -525,6 +529,7 @@ struct drm_gpu_scheduler {
 	struct drm_sched_rq **sched_rq;
 	wait_queue_head_t job_scheduled;
 	atomic64_t job_id_count;
+	struct kref jobs_remaining;
 	struct workqueue_struct *submit_wq;
 	struct workqueue_struct *timeout_wq;
 	struct work_struct work_run_job;
-- 
2.46.0
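
P.S.: For illustration, below is a minimal, self-contained userspace
sketch of the lifecycle the patch implements. It is an analogy, not
kernel code: C11 atomics stand in for struct kref, and mock_sched,
mock_put() and friends are invented names. drm_sched_init()
corresponds to taking the initial reference, every job on the
pending_list holds one more, drm_sched_fini() drops only the initial
one, and teardown therefore runs only once the last pending job is
gone.

/* Sketch only: models the jobs_remaining refcount with C11 atomics. */
#include <stdatomic.h>
#include <stdio.h>

struct mock_sched {
	atomic_int jobs_remaining;	/* stands in for struct kref */
};

/* stands in for __drm_sched_fini(), the kref release callback */
static void mock_sched_release(struct mock_sched *sched)
{
	printf("teardown: pending_list empty, freeing scheduler resources\n");
}

static void mock_put(struct mock_sched *sched)
{
	/* like kref_put(): run the release callback on the final reference */
	if (atomic_fetch_sub(&sched->jobs_remaining, 1) == 1)
		mock_sched_release(sched);
}

int main(void)
{
	struct mock_sched sched;

	atomic_init(&sched.jobs_remaining, 1);		/* drm_sched_init(): kref_init() */

	atomic_fetch_add(&sched.jobs_remaining, 1);	/* job 1 enters pending_list */
	atomic_fetch_add(&sched.jobs_remaining, 1);	/* job 2 enters pending_list */

	mock_put(&sched);	/* drm_sched_fini(): must not tear down yet */
	printf("fini called, jobs still pending\n");

	mock_put(&sched);	/* job 1 leaves pending_list */
	mock_put(&sched);	/* job 2 leaves: last reference, teardown runs now */

	return 0;
}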