Re: [PATCH v2] drm/amdgpu: Fix the looply call svm_range_restore_pages issue

Felix Kuehling <felix.kuehling@xxxxxxx> · Fri, 3 Jan 2025 18:18:07 -0500

On 2025-01-02 21:26, Emily Deng wrote:
As the delayed free pt, the wanted freed bo has been reused which will cause
unexpected page fault, and then call svm_range_restore_pages.

Detail as below:
1.It wants to free the pt in follow code, but it is not freed immediately
and used “schedule_work(&vm->pt_free_work);”.

[   92.276838] Call Trace:
[   92.276841]  dump_stack+0x63/0xa0
[   92.276887]  amdgpu_vm_pt_free_list+0xfb/0x120 [amdgpu]
[   92.276932]  amdgpu_vm_update_range+0x69c/0x8e0 [amdgpu]
[   92.276990]  svm_range_unmap_from_gpus+0x112/0x310 [amdgpu]
[   92.277046]  svm_range_cpu_invalidate_pagetables+0x725/0x780 [amdgpu]
[   92.277050]  ? __alloc_pages_nodemask+0x19f/0x3e0
[   92.277051]  mn_itree_invalidate+0x72/0xc0
[   92.277052]  __mmu_notifier_invalidate_range_start+0x48/0x60
[   92.277054]  migrate_vma_collect+0xf6/0x100
[   92.277055]  migrate_vma_setup+0xcf/0x120
[   92.277109]  svm_migrate_ram_to_vram+0x256/0x6b0 [amdgpu]

2.Call svm_range_map_to_gpu->amdgpu_vm_update_range to update the page
table, at this time it will use the same entry bo which is the want free
bo in step1.

3.Then it executes the pt_free_work to free the bo. At this time, the GPU
access memory will cause page fault as pt bo has been freed. And then it will
call svm_range_restore_pages again.

How to fix?
Add a workqueue, and flush the workqueue each time before updating page table.

I think this is kind of a known issue in the GPUVM code. Philip was 
looking at it before.

Just flushing a workqueue may seem like a simple and elegant solution, 
but I'm afraid it introduces lock dependency issues. By flushing the 
workqueue, you're effectively creating a lock dependency of the MMU 
notifier on any locks held inside the worker function. You now get a 
circular lock dependency with any of those locks and memory reclaim. I 
think LOCKDEP would be able to catch that if you compile your kernel 
with that feature enabled.

The proper solution is to prevent delayed freeing of page tables if they 
happened to get reused, or prevent reuse of page tables if they are 
flagged for delayed freeing.

Regards,
  Felix



Signed-off-by: Emily Deng <Emily.Deng@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h              | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c           | 7 +++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c        | 6 +++++-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c             | 3 +++
  5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 93c352b08969..cbf68ad1c8d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1188,6 +1188,7 @@ struct amdgpu_device {
  	struct mutex                    enforce_isolation_mutex;
  
  	struct amdgpu_init_level *init_lvl;
+	struct workqueue_struct *wq;
  };
  
  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index f30548f4c3b3..5b4835bc81b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2069,6 +2069,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
  		if (ret)
  			goto out;
  	}
+	flush_workqueue(adev->wq);
  
  	ret = reserve_bo_and_vm(mem, avm, &ctx);
  	if (unlikely(ret))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 9d6ffe38b48a..500d97cd9114 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2607,7 +2607,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
  	amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
  
  	flush_work(&vm->pt_free_work);
-
+	cancel_work_sync(&vm->pt_free_work);
  	root = amdgpu_bo_ref(vm->root.bo);
  	amdgpu_bo_reserve(root, true);
  	amdgpu_vm_put_task_info(vm->task_info);
@@ -2708,6 +2708,8 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
  #endif
  
  	xa_init_flags(&adev->vm_manager.pasids, XA_FLAGS_LOCK_IRQ);
+	adev->wq = alloc_workqueue("amdgpu_recycle",
+				   WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16);
  }
  
  /**
@@ -2721,7 +2723,8 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev)
  {
  	WARN_ON(!xa_empty(&adev->vm_manager.pasids));
  	xa_destroy(&adev->vm_manager.pasids);
-
+	flush_workqueue(adev->wq);
+	destroy_workqueue(adev->wq);
  	amdgpu_vmid_mgr_fini(adev);
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index f78a0434a48f..1204406215ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -554,15 +554,19 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
  
  	vm = container_of(work, struct amdgpu_vm, pt_free_work);
  
+	printk("Emily:%s\n", __func__);
  	spin_lock(&vm->status_lock);
  	list_splice_init(&vm->pt_freed, &pt_freed);
  	spin_unlock(&vm->status_lock);
+	printk("Emily:%s 1\n", __func__);
  
  	/* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */
  	amdgpu_bo_reserve(vm->root.bo, true);
+	printk("Emily:%s 2\n", __func__);
  
  	list_for_each_entry_safe(entry, next, &pt_freed, vm_status)
  		amdgpu_vm_pt_free(entry);
+	printk("Emily:%s 3\n", __func__);
  
  	amdgpu_bo_unreserve(vm->root.bo);
  }
@@ -589,7 +593,7 @@ void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
  		spin_lock(&vm->status_lock);
  		list_splice_init(&params->tlb_flush_waitlist, &vm->pt_freed);
  		spin_unlock(&vm->status_lock);
-		schedule_work(&vm->pt_free_work);
+		queue_work(adev->wq, &vm->pt_free_work);
  		return;
  	}
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 3e2911895c74..55edf96d5a95 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1314,6 +1314,7 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
  	uint64_t init_pte_value = 0;
  
  	pr_debug("[0x%llx 0x%llx]\n", start, last);
+	flush_workqueue(adev->wq);
  
  	return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, start,
  				      last, init_pte_value, 0, 0, NULL, NULL,
@@ -1422,6 +1423,8 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
  		 * different memory partition based on fpfn/lpfn, we should use
  		 * same vm_manager.vram_base_offset regardless memory partition.
  		 */
+		flush_workqueue(adev->wq);
+
  		r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, true,
  					   NULL, last_start, prange->start + i,
  					   pte_flags,