On 2025-02-20 06:59, Emily Deng wrote:
Call amdgpu_amdkfd_reserve_mem_limit in svm_range_vram_node_new when
creating a new SVM BO. Call amdgpu_amdkfd_unreserve_mem_limit
in svm_range_bo_release when the SVM BO is deleted.
Signed-off-by: Emily Deng <Emily.Deng@xxxxxxx>
---
drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 13 -------------
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 20 ++++++++++++++++++++
2 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 79251f22b702..3bbc69751f7e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -514,15 +514,6 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
start = start_mgr << PAGE_SHIFT;
end = (last_mgr + 1) << PAGE_SHIFT;
- r = amdgpu_amdkfd_reserve_mem_limit(node->adev,
- prange->npages * PAGE_SIZE,
- KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
- node->xcp ? node->xcp->id : 0);
- if (r) {
- dev_dbg(node->adev->dev, "failed to reserve VRAM, r: %ld\n", r);
- return -ENOSPC;
- }
-
From the git history, this reservation exists to "make sure there is
enough available VRAM and migrating to VRAM doesn't evict other possible
non-unified memory BOs". Without it, the migration will unexpectedly
trigger the OOM killer.
We should keep this for xnack on.
r = svm_range_vram_node_new(node, prange, true);
if (r) {
dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r);
@@ -560,10 +551,6 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
}
out:
- amdgpu_amdkfd_unreserve_mem_limit(node->adev,
- prange->npages * PAGE_SIZE,
- KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
- node->xcp ? node->xcp->id : 0);
Keep this unreserve call for xnack on as well.
return r < 0 ? r : 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index db3034b00dac..c861d8c90419 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -430,6 +430,10 @@ static void svm_range_bo_release(struct kref *kref)
/* We're not in the eviction worker. Signal the fence. */
dma_fence_signal(&svm_bo->eviction_fence->base);
dma_fence_put(&svm_bo->eviction_fence->base);
+ amdgpu_amdkfd_unreserve_mem_limit(svm_bo->node->adev,
+ svm_bo->bo->tbo.base.size,
+ KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
+ svm_bo->node->xcp ? svm_bo->node->xcp->id : 0);
amdgpu_bo_unref(&svm_bo->bo);
kfree(svm_bo);
}
@@ -581,6 +585,18 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
kfree(svm_bo);
return -ESRCH;
}
+
+ r = amdgpu_amdkfd_reserve_mem_limit(node->adev,
+ prange->npages * PAGE_SIZE,
+ KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
+ node->xcp ? node->xcp->id : 0);
+ if (r) {
+ pr_debug("failed to reserve VRAM, r: %ld\n", r);
+ mmput(mm);
+ kfree(svm_bo);
+ return -ENOSPC;
+ }
+
Reserving against the VRAM limit here should only be done for xnack off;
with xnack on, VRAM overcommit should work.
Regards,
Philip
svm_bo->node = node;
svm_bo->eviction_fence =
amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
@@ -655,6 +671,10 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
reserve_bo_failed:
amdgpu_bo_unref(&bo);
create_bo_failed:
+ amdgpu_amdkfd_unreserve_mem_limit(svm_bo->node->adev,
+ prange->npages * PAGE_SIZE,
+ KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
+ node->xcp ? node->xcp->id : 0);
dma_fence_put(&svm_bo->eviction_fence->base);
kfree(svm_bo);
prange->ttm_res = NULL;