Make restore workers freezable so we don't have to explicitly flush them in suspend and GPU reset code paths, and we don't accidentally try to restore BOs while the GPU is suspended. Not having to flush restore_work also helps avoid lock/fence dependencies in the GPU reset case where we're not allowed to wait for fences. This is an RFC and request for testing. I have not tested this myself yet. My notes below: Restore work won't be frozen during GPU reset. Does it matter? Queues will stay evicted until resume in any case. But restore_work may be in trouble if it gets run in the middle of a GPU reset. In that case, if anything fails, it will just reschedule itself, so should be fine as long as it doesn't interfere with the reset itself (respects any mechanisms in place to prevent HW access during the reset). What HW access does restore_work perform: - migrating buffers: uses GPU scheduler, should be suspended during reset - TLB flushes in userptr restore worker: not called directly, relies on HWS to flush TLBs on VMID attachment - TLB flushes in SVM restore worker: Does TLB flush in the mapping code - Resuming user mode queues: should not happen while GPU reset keeps queue eviction counter elevated Ergo: Except for the SVM case, it's OK to not flush restore work before GPU resets. I'll need to rethink the interaction of SVM restore_work with GPU resets. What about cancelling p->eviction_work? Eviction work would normally be needed to signal eviction fences, but we're doing that explicitly in suspend_all_processes. Does eviction_work wait for fences anywhere? Yes, indirectly by flushing restore_work. So we should not try to cancel it during a GPU reset. Problem: accessing p->ef concurrently in evict_process_worker and suspend_all_processes. Need a spinlock so we can read and update it safely. Problem: What if evict_process_worker gets stuck in flushing restore_work? We can skip all of that if p->ef is NULL (which it is during the reset). 
Even if it gets stuck, it's no problem if the reset doesn't depend on it. It should get unstuck after the reset. Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 ++-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 49 +++++++++++++------ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54f31a420229..89e632257663 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1644,7 +1644,8 @@ int amdgpu_amdkfd_criu_resume(void *p) goto out_unlock; } WRITE_ONCE(pinfo->block_mmu_notifications, false); - schedule_delayed_work(&pinfo->restore_userptr_work, 0); + queue_delayed_work(system_freezable_wq, + &pinfo->restore_userptr_work, 0); out_unlock: mutex_unlock(&pinfo->lock); @@ -2458,7 +2459,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, KFD_QUEUE_EVICTION_TRIGGER_USERPTR); if (r) pr_err("Failed to quiesce KFD\n"); - schedule_delayed_work(&process_info->restore_userptr_work, + queue_delayed_work(system_freezable_wq, + &process_info->restore_userptr_work, msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); } mutex_unlock(&process_info->notifier_lock); @@ -2793,7 +2795,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) /* If validation failed, reschedule another attempt */ if (evicted_bos) { - schedule_delayed_work(&process_info->restore_userptr_work, + queue_delayed_work(system_freezable_wq, + &process_info->restore_userptr_work, msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); kfd_smi_event_queue_restore_rescheduled(mm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 9cc32f577e38..cf017d027fee 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -919,6 +919,7 @@ struct kfd_process { * during restore */ struct dma_fence *ef; + spinlock_t ef_lock; /* Work items for evicting and restoring BOs */ struct delayed_work eviction_work; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index fbf053001af9..a07cba58ec5e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -664,7 +664,8 @@ int kfd_process_create_wq(void) if (!kfd_process_wq) kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); if (!kfd_restore_wq) - kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0); + kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", + WQ_FREEZABLE); if (!kfd_process_wq || !kfd_restore_wq) { kfd_process_destroy_wq(); @@ -1460,6 +1461,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) kref_init(&process->ref); mutex_init(&process->mutex); + spin_lock_init(&process->ef_lock); process->mm = thread->mm; process->lead_thread = thread->group_leader; process->n_pdds = 0; @@ -1904,6 +1906,19 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node, return -EINVAL; } +static void signal_eviction_fence(struct kfd_process *p) +{ + spin_lock(&p->ef_lock); + if (!p->ef) + goto unlock_out; + dma_fence_signal(p->ef); + dma_fence_put(p->ef); + WRITE_ONCE(p->ef, NULL); + +unlock_out: + spin_unlock(&p->ef_lock); +} + static void evict_process_worker(struct work_struct *work) { int ret; @@ -1916,8 +1931,11 @@ static void evict_process_worker(struct work_struct *work) * lifetime of this thread, kfd_process p will be valid */ p = container_of(dwork, struct kfd_process, eviction_work); - WARN_ONCE(p->last_eviction_seqno != p->ef->seqno, - "Eviction fence mismatch\n"); + /* If the eviction fence is not valid, we're probably in a suspend + * or GPU reset cycle. There is nothing to do in this case. 
+ */ + if (!READ_ONCE(p->ef)) + return; /* Narrow window of overlap between restore and evict work * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos @@ -1930,9 +1948,7 @@ static void evict_process_worker(struct work_struct *work) pr_debug("Started evicting pasid 0x%x\n", p->pasid); ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM); if (!ret) { - dma_fence_signal(p->ef); - dma_fence_put(p->ef); - p->ef = NULL; + signal_eviction_fence(p); queue_delayed_work(kfd_restore_wq, &p->restore_work, msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); @@ -1967,9 +1983,19 @@ static void restore_process_worker(struct work_struct *work) p->last_restore_timestamp = get_jiffies_64(); /* VMs may not have been acquired yet during debugging. */ - if (p->kgd_process_info) + if (p->kgd_process_info) { + struct dma_fence *ef = NULL; + ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info, - &p->ef); + &ef); + if (!ret) { + spin_lock(&p->ef_lock); + WARN_ONCE(p->ef, "Eviction fence is not NULL"); + WRITE_ONCE(p->ef, ef); + spin_unlock(&p->ef_lock); + } + } + if (ret) { pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n", p->pasid, PROCESS_BACK_OFF_TIME_MS); @@ -1994,14 +2020,9 @@ void kfd_suspend_all_processes(void) WARN(debug_evictions, "Evicting all processes"); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - cancel_delayed_work_sync(&p->eviction_work); - flush_delayed_work(&p->restore_work); - if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND)) pr_err("Failed to suspend process 0x%x\n", p->pasid); - dma_fence_signal(p->ef); - dma_fence_put(p->ef); - p->ef = NULL; + signal_eviction_fence(p); } srcu_read_unlock(&kfd_processes_srcu, idx); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index fe937670c927..aa6c34127be9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1869,7 +1869,7 @@ static void 
svm_range_restore_work(struct work_struct *work) /* If validation failed, reschedule another attempt */ if (evicted_ranges) { pr_debug("reschedule to restore svm range\n"); - schedule_delayed_work(&svms->restore_work, + queue_delayed_work(system_freezable_wq, &svms->restore_work, msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); kfd_smi_event_queue_restore_rescheduled(mm); @@ -1945,7 +1945,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm, pr_debug("failed to quiesce KFD\n"); pr_debug("schedule to restore svm %p ranges\n", svms); - schedule_delayed_work(&svms->restore_work, + queue_delayed_work(system_freezable_wq, &svms->restore_work, msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); } else { unsigned long s, l; -- 2.34.1