DMABuf imports in compute VMs are not wrapped in a kgd_mem object on the
process_info->kfd_bo_list. There is no explicit KFD API call to validate
them or add eviction fences to them.
This patch automatically validates and fences dymanic DMABuf imports
when
they are added to a compute VM. Revalidation after evictions is handled
in the VM code.
v2:
* Renamed amdgpu_vm_validate_evicted_bos to amdgpu_vm_validate
* Eliminated evicted_user state, use evicted state for VM BOs and
user BOs
* Fixed and simplified amdgpu_vm_fence_imports, depends on reserved BOs
* Moved dma_resv_reserve_fences for amdgpu_vm_fence_imports into
amdgpu_vm_validate, outside the vm->status_lock
* Added dummy version of amdgpu_amdkfd_bo_validate_and_fence for builds
without KFD
Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 ++
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 44 +++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 4 +
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 28 ++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 101 ++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 10 +-
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +-
8 files changed, 177 insertions(+), 36 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index f262b9d89541..584a0cea5572 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -191,6 +191,9 @@ struct amdgpu_amdkfd_fence
*to_amdgpu_amdkfd_fence(struct dma_fence *f);
int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
unsigned long cur_seq, struct kgd_mem *mem);
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+ uint32_t domain,
+ struct dma_fence *fence);
#else
static inline
bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
@@ -216,6 +219,13 @@ int amdgpu_amdkfd_evict_userptr(struct
mmu_interval_notifier *mni,
{
return 0;
}
+static inline
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+ uint32_t domain,
+ struct dma_fence *fence)
+{
+ return 0;
+}
#endif
/* Shared API */
int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t
size,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5f445d856769..fbabb1e63a67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -426,9 +426,9 @@ static int amdgpu_amdkfd_bo_validate(struct
amdgpu_bo *bo, uint32_t domain,
return ret;
}
-static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
- uint32_t domain,
- struct dma_fence *fence)
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+ uint32_t domain,
+ struct dma_fence *fence)
{
int ret = amdgpu_bo_reserve(bo, false);
@@ -464,13 +464,15 @@ static int amdgpu_amdkfd_validate_vm_bo(void
*_unused, struct amdgpu_bo *bo)
* again. Page directories are only updated after updating page
* tables.
*/
-static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
+static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm,
+ struct ww_acquire_ctx *ticket)
{
struct amdgpu_bo *pd = vm->root.bo;
struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
int ret;
- ret = amdgpu_vm_validate_pt_bos(adev, vm,
amdgpu_amdkfd_validate_vm_bo, NULL);
+ ret = amdgpu_vm_validate(adev, vm, ticket,
+ amdgpu_amdkfd_validate_vm_bo, NULL);
if (ret) {
pr_err("failed to validate PT BOs\n");
return ret;
@@ -1310,14 +1312,15 @@ static int map_bo_to_gpuvm(struct kgd_mem *mem,
return ret;
}
-static int process_validate_vms(struct amdkfd_process_info
*process_info)
+static int process_validate_vms(struct amdkfd_process_info
*process_info,
+ struct ww_acquire_ctx *ticket)
{
struct amdgpu_vm *peer_vm;
int ret;
list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) {
- ret = vm_validate_pt_pd_bos(peer_vm);
+ ret = vm_validate_pt_pd_bos(peer_vm, ticket);
if (ret)
return ret;
}
@@ -1402,7 +1405,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm,
void **process_info,
ret = amdgpu_bo_reserve(vm->root.bo, true);
if (ret)
goto reserve_pd_fail;
- ret = vm_validate_pt_pd_bos(vm);
+ ret = vm_validate_pt_pd_bos(vm, NULL);
if (ret) {
pr_err("validate_pt_pd_bos() failed\n");
goto validate_pd_fail;
@@ -2043,7 +2046,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
is_invalid_userptr = true;
- ret = vm_validate_pt_pd_bos(avm);
+ ret = vm_validate_pt_pd_bos(avm, NULL);
if (unlikely(ret))
goto out_unreserve;
@@ -2122,7 +2125,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
goto unreserve_out;
}
- ret = vm_validate_pt_pd_bos(avm);
+ ret = vm_validate_pt_pd_bos(avm, NULL);
if (unlikely(ret))
goto unreserve_out;
@@ -2620,7 +2623,7 @@ static int validate_invalid_user_pages(struct
amdkfd_process_info *process_info)
}
}
- ret = process_validate_vms(process_info);
+ ret = process_validate_vms(process_info, NULL);
if (ret)
goto unreserve_out;
@@ -2880,11 +2883,6 @@ int
amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *
amdgpu_sync_create(&sync_obj);
- /* Validate PDs and PTs */
- ret = process_validate_vms(process_info);
- if (ret)
- goto validate_map_fail;
-
/* Validate BOs and map them to GPUVM (update VM page tables). */
list_for_each_entry(mem, &process_info->kfd_bo_list,
validate_list) {
@@ -2935,6 +2933,13 @@ int
amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *
if (failed_size)
pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);
+ /* Validate PDs, PTs and evicted DMABuf imports last.
Otherwise BO
+ * validations above would invalidate DMABuf imports again.
+ */
+ ret = process_validate_vms(process_info, &exec.ticket);
+ if (ret)
+ goto validate_map_fail;
+
/* Update mappings not managed by KFD */
list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) {
@@ -3006,7 +3011,7 @@ int
amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *
&process_info->eviction_fence->base,
DMA_RESV_USAGE_BOOKKEEP);
}
- /* Attach eviction fence to PD / PT BOs */
+ /* Attach eviction fence to PD / PT BOs and DMABuf imports */
list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) {
struct amdgpu_bo *bo = peer_vm->root.bo;
@@ -3014,6 +3019,11 @@ int
amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence
__rcu *
dma_resv_add_fence(bo->tbo.base.resv,
&process_info->eviction_fence->base,
DMA_RESV_USAGE_BOOKKEEP);
+
+ ret = amdgpu_vm_fence_imports(peer_vm, &exec.ticket,
+ &process_info->eviction_fence->base);
+ if (ret)
+ break;
}
validate_map_fail:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index e50be6500030..dc7fac778417 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -952,10 +952,10 @@ static int amdgpu_cs_parser_bos(struct
amdgpu_cs_parser *p,
p->bytes_moved = 0;
p->bytes_moved_vis = 0;
- r = amdgpu_vm_validate_pt_bos(p->adev, &fpriv->vm,
- amdgpu_cs_bo_validate, p);
+ r = amdgpu_vm_validate(p->adev, &fpriv->vm, NULL,
+ amdgpu_cs_bo_validate, p);
if (r) {
- DRM_ERROR("amdgpu_vm_validate_pt_bos() failed.\n");
+ DRM_ERROR("amdgpu_vm_validate() failed.\n");
goto out_free_user_pages;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index decbbe3d4f06..055ba2ea4c12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -377,6 +377,10 @@ amdgpu_dma_buf_move_notify(struct
dma_buf_attachment *attach)
struct amdgpu_vm_bo_base *bo_base;
int r;
+ /* FIXME: This should be after the "if", but needs a fix to
make sure
+ * DMABuf imports are initialized in the right VM list.
+ */
+ amdgpu_vm_bo_invalidate(adev, bo, false);
if (!bo->tbo.resource || bo->tbo.resource->mem_type ==
TTM_PL_SYSTEM)
return;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index b1ce22a9b186..68bab2879696 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -187,7 +187,33 @@ static int amdgpu_gem_object_open(struct
drm_gem_object *obj,
else
++bo_va->ref_count;
amdgpu_bo_unreserve(abo);
- return 0;
+
+ /* Validate and add eviction fence to DMABuf imports with dynamic
+ * attachment in compute VMs. Re-validation will be done by
+ * amdgpu_vm_validate and the fence will be updated by
+ * amdgpu_vm_fence_imports in
amdgpu_amdkfd_gpuvm_restore_process_bos.
+ *
+ * Nested locking below for the case that a GEM object is opened in
+ * kfd_mem_export_dmabuf. Since the lock below is only taken for
imports,
+ * but not for export, this is a different lock class that
cannot lead to
+ * circular lock dependencies.
+ */
+ if (!vm->is_compute_context || !vm->process_info)
+ return 0;
+ if (!obj->import_attach ||
+ !dma_buf_is_dynamic(obj->import_attach->dmabuf))
+ return 0;
+ mutex_lock_nested(&vm->process_info->lock, 1);
+ if (!WARN_ON(!vm->process_info->eviction_fence)) {
+ r = amdgpu_amdkfd_bo_validate_and_fence(abo,
AMDGPU_GEM_DOMAIN_GTT,
+ &vm->process_info->eviction_fence->base);
+ if (r)
+ dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n",
+ vm->task_info.pid, r);
+ }
+ mutex_unlock(&vm->process_info->lock);
+
+ return r;
}
static void amdgpu_gem_object_close(struct drm_gem_object *obj,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 7da71b6a9dc6..ab2662ab4ab8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -426,24 +426,29 @@ uint64_t amdgpu_vm_generation(struct
amdgpu_device *adev, struct amdgpu_vm *vm)
}
/**
- * amdgpu_vm_validate_pt_bos - validate the page table BOs
+ * amdgpu_vm_validate - validate evicted BOs tracked in the VM
*
* @adev: amdgpu device pointer
* @vm: vm providing the BOs
+ * @ticket: optional reservation ticket used to reserve the VM
* @validate: callback to do the validation
* @param: parameter for the validation callback
*
- * Validate the page table BOs on command submission if neccessary.
+ * Validate the page table BOs and per-VM BOs on command submission if
+ * necessary. If a ticket is given, also try to validate evicted
user queue
+ * BOs. They must already be reserved with the given ticket.
*
* Returns:
* Validation result.
*/
-int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct
amdgpu_vm *vm,
- int (*validate)(void *p, struct amdgpu_bo *bo),
- void *param)
+int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm
*vm,
+ struct ww_acquire_ctx *ticket,
+ int (*validate)(void *p, struct amdgpu_bo *bo),
+ void *param)
{
struct amdgpu_vm_bo_base *bo_base;
struct amdgpu_bo *shadow;
+ struct dma_resv *resv;
struct amdgpu_bo *bo;
int r;
@@ -464,8 +469,27 @@ int amdgpu_vm_validate_pt_bos(struct
amdgpu_device *adev, struct amdgpu_vm *vm,
spin_unlock(&vm->status_lock);
bo = bo_base->bo;
+ resv = bo->tbo.base.resv;
shadow = amdgpu_bo_shadowed(bo);
+ if (resv != vm->root.bo->tbo.base.resv) {
+ if (!ticket) {
+ /* We need to move the BO out of the evicted
+ * list to avoid an infinite loop. It will be
+ * moved back to evicted in the next
+ * amdgpu_vm_handle_moved.
+ */
+ amdgpu_vm_bo_invalidated(bo_base);
+ spin_lock(&vm->status_lock);
+ continue;
+ }
+ if (dma_resv_locking_ctx(resv) != ticket) {
+ pr_warn_ratelimited("Evicted user BO is not reserved
in pid %d\n",
+ vm->task_info.pid);