SVM migration unmap pages from GPU and then update mapping to GPU to
recover page fault. Currently unmap clears the PDE entry for range
length >= huge page and may free PTB bo. Then update mapping should
alloc new PT bo, but there is race bug that the freed entry bo maybe
still on the pt_free list, reused when updating mapping and then freed,
leave invalid PDE entry and cause GPU page fault.
By setting the update fragment size to 2MB or 1GB, update will clear
only one PDE entry or clear PTB, to avoid unmap to free PTE bo. This
fixes the race bug and also improve the unmap and map to GPU
performance. Update mapping to huge page will still free the PTB bo.
With this change, the vm->pt_freed list and work is not needed. Add
WARN_ON(unlocked) in amdgpu_vm_pt_add_list to catch if unmap to free the
PTB.
v2: Limit update fragment size, not hack entry_end (Christian)
Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 --
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 --
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 54 +++++++++--------------
3 files changed, 21 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index c9c48b782ec1..48b2c0b3b315 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2440,8 +2440,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
spin_lock_init(&vm->status_lock);
INIT_LIST_HEAD(&vm->freed);
INIT_LIST_HEAD(&vm->done);
- INIT_LIST_HEAD(&vm->pt_freed);
- INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
INIT_KFIFO(vm->faults);
r = amdgpu_vm_init_entities(adev, vm);
@@ -2613,8 +2611,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
- flush_work(&vm->pt_free_work);
-
root = amdgpu_bo_ref(vm->root.bo);
amdgpu_bo_reserve(root, true);
amdgpu_vm_put_task_info(vm->task_info);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 5d119ac26c4f..160889e5e64d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -369,10 +369,6 @@ struct amdgpu_vm {
/* BOs which are invalidated, has been updated in the PTs */
struct list_head done;
- /* PT BOs scheduled to free and fill with zero if vm_resv is not hold */
- struct list_head pt_freed;
- struct work_struct pt_free_work;
-
/* contains the page directory */
struct amdgpu_vm_bo_base root;
struct dma_fence *last_update;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index f78a0434a48f..063d0e0a9f29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -546,27 +546,6 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
amdgpu_bo_unref(&entry->bo);
}
-void amdgpu_vm_pt_free_work(struct work_struct *work)
-{
- struct amdgpu_vm_bo_base *entry, *next;
- struct amdgpu_vm *vm;
- LIST_HEAD(pt_freed);
-
- vm = container_of(work, struct amdgpu_vm, pt_free_work);
-
- spin_lock(&vm->status_lock);
- list_splice_init(&vm->pt_freed, &pt_freed);
- spin_unlock(&vm->status_lock);
-
- /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */
- amdgpu_bo_reserve(vm->root.bo, true);
-
- list_for_each_entry_safe(entry, next, &pt_freed, vm_status)
- amdgpu_vm_pt_free(entry);
-
- amdgpu_bo_unreserve(vm->root.bo);
-}
-
/**
* amdgpu_vm_pt_free_list - free PD/PT levels
*
@@ -579,20 +558,9 @@ void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
struct amdgpu_vm_update_params *params)
{
struct amdgpu_vm_bo_base *entry, *next;
- struct amdgpu_vm *vm = params->vm;
- bool unlocked = params->unlocked;
if (list_empty(¶ms->tlb_flush_waitlist))
return;
-
- if (unlocked) {
- spin_lock(&vm->status_lock);
- list_splice_init(¶ms->tlb_flush_waitlist, &vm->pt_freed);
- spin_unlock(&vm->status_lock);
- schedule_work(&vm->pt_free_work);
- return;
- }
-
list_for_each_entry_safe(entry, next, ¶ms->tlb_flush_waitlist, vm_status)
amdgpu_vm_pt_free(entry);
}
@@ -611,6 +579,11 @@ static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params,
struct amdgpu_vm_pt_cursor seek;
struct amdgpu_vm_bo_base *entry;
+ /*
+ * unlocked unmap only clear page table leaves, warning to free the page table entry.
+ */
+ WARN_ON(params->unlocked);
+
spin_lock(¶ms->vm->status_lock);
for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry) {
if (entry && entry->bo)
@@ -794,6 +767,21 @@ static void amdgpu_vm_pte_fragment(struct amdgpu_vm_update_params *params,
/* This intentionally wraps around if no bit is set */
*frag = min_t(unsigned int, ffs(start) - 1, fls64(end - start) - 1);
+
+ /*
+ * MMU notifier callback unlocked unmap only clear PDE or PTE leaves by setting fragment
+ * size to 2MB, 1GB, 512GB. If leave is PDE entry, only clear one entry, next fragment entry
+ * will search again for PDE or PTE leaves.
+ */
+ if (params->unlocked) {
+ if (*frag > 9 && *frag < 18)
+ *frag = 9;
+ else if (*frag > 18 && *frag < 27)
+ *frag = 18;
+ else if (*frag > 27)
+ *frag = 27;
+ }
+