DMABuf imports in compute VMs are not wrapped in a kgd_mem object on the process_info->kfd_bo_list. There is no explicit KFD API call to validate them or add eviction fences to them. This patch automatically validates and fences dymanic DMABuf imports when they are added to a compute VM. Revalidation after evictions is handled in the VM code. Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 15 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 26 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 117 +++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 6 +- 7 files changed, 164 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index fcf8a98ad15e..68d534a89942 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -178,6 +178,9 @@ int amdgpu_queue_mask_bit_to_set_resource_bit(struct amdgpu_device *adev, struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, struct mm_struct *mm, struct svm_range_bo *svm_bo); +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, + uint32_t domain, + struct dma_fence *fence); #if defined(CONFIG_DEBUG_FS) int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 2e302956a279..0c1cb6048259 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -423,9 +423,9 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, return ret; } -static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, - uint32_t domain, - struct dma_fence *fence) +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, + uint32_t domain, + struct dma_fence *fence) { int ret = amdgpu_bo_reserve(bo, false); @@ -2948,7 +2948,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) struct amdgpu_device *adev = amdgpu_ttm_adev( peer_vm->root.bo->tbo.bdev); - ret = amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket); + ret = amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket, true); if (ret) { pr_debug("Memory eviction: handle moved failed. Try again\n"); goto validate_map_fail; @@ -3001,7 +3001,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) &process_info->eviction_fence->base, DMA_RESV_USAGE_BOOKKEEP); } - /* Attach eviction fence to PD / PT BOs */ + /* Attach eviction fence to PD / PT BOs and DMABuf imports */ list_for_each_entry(peer_vm, &process_info->vm_list_head, vm_list_node) { struct amdgpu_bo *bo = peer_vm->root.bo; @@ -3009,6 +3009,11 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) dma_resv_add_fence(bo->tbo.base.resv, &process_info->eviction_fence->base, DMA_RESV_USAGE_BOOKKEEP); + + ret = amdgpu_vm_fence_imports(peer_vm, &ctx.ticket, + &process_info->eviction_fence->base); + if (ret) + break; } validate_map_fail: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index aafedb344c1b..20f4be8cd635 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1165,7 +1165,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) return r; } - r = amdgpu_vm_handle_moved(adev, vm, &p->ticket); + r = amdgpu_vm_handle_moved(adev, vm, &p->ticket, false); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index e7e87a3b2601..234244704f27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -373,6 +373,10 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach) struct amdgpu_vm_bo_base *bo_base; int r; + /* FIXME: This should be after the "if", but needs a fix to make sure + * DMABuf imports are initialized in the right VM list. + */ + amdgpu_vm_bo_invalidate(adev, bo, false); if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM) return; @@ -409,7 +413,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach) if (!r) r = amdgpu_vm_clear_freed(adev, vm, NULL); if (!r) - r = amdgpu_vm_handle_moved(adev, vm, ticket); + r = amdgpu_vm_handle_moved(adev, vm, ticket, false); if (r && r != -EBUSY) DRM_ERROR("Failed to invalidate VM page tables (%d))\n", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 849fffbb367d..755cc3c559f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -186,6 +186,32 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj, else ++bo_va->ref_count; amdgpu_bo_unreserve(abo); + + /* Validate and add eviction fence to DMABuf imports with dymanic + * attachment in compute VMs. Re-validation will be done by + * amdgpu_vm_handle_moved and the fence will be updated by + * amdgpu_vm_fence_imports in amdgpu_amdkfd_gpuvm_restore_process_bos. + * + * Nested locking below for the case that a GEM object is opened in + * kfd_mem_export_dmabuf. Since the lock below is only taken for imports, + * but not for export, this is a different lock class that cannot lead to + * circular lock dependencies. + */ + if (!vm->is_compute_context || !vm->process_info) + return 0; + if (!obj->import_attach || + !dma_buf_is_dynamic(obj->import_attach->dmabuf)) + return 0; + mutex_lock_nested(&vm->process_info->lock, 1); + if (!WARN_ON(!vm->process_info->eviction_fence)) { + r = amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT, + &vm->process_info->eviction_fence->base); + if (r) + dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n", + vm->task_info.pid, r); + } + mutex_unlock(&vm->process_info->lock); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0d685577243c..b2c7449ab561 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1307,6 +1307,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, * @adev: amdgpu_device pointer * @vm: requested vm * @ticket: optional reservation ticket used to reserve the VM + * @valitate: whether to auto-validate invalid DMABuf imports * * Make sure all BOs which are moved are updated in the PTs. * @@ -1317,7 +1318,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, */ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, struct amdgpu_vm *vm, - struct ww_acquire_ctx *ticket) + struct ww_acquire_ctx *ticket, + bool validate) { struct amdgpu_bo_va *bo_va; struct dma_resv *resv; @@ -1337,6 +1339,12 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, spin_lock(&vm->status_lock); } + /* If we're validating user BOs, splice all evicted user BOs into + * the list of invalid BOs for revalidation + */ + if (validate) + list_splice_init(&vm->evicted_user, &vm->invalidated); + while (!list_empty(&vm->invalidated)) { bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, base.vm_status); @@ -1357,17 +1365,120 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, unlock = false; } + /* Automatically validate DMABuf imports in compute VMs, if we + * have a reservation, or remember them for later validation. + */ + if (vm->is_compute_context && bo_va->base.bo && + bo_va->base.bo->tbo.base.import_attach && + (!bo_va->base.bo->tbo.resource || + bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM)) { + struct ttm_operation_ctx ctx = { true, false }; + struct amdgpu_bo *bo = bo_va->base.bo; + + if (!validate) { + r = amdgpu_vm_bo_update(adev, bo_va, clear); + if (!r) + amdgpu_vm_bo_evicted_user(&bo_va->base); + goto unlock; + } + + if (clear) { + pr_warn_ratelimited("Invalid DMABuf import is busy in pid %d\n", vm->task_info.pid); + r = -EBUSY; + goto unlock; + } + + amdgpu_bo_placement_from_domain(bo, + bo->preferred_domains); + r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + if (r) + goto unlock; + r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, + true); + if (r) + goto unlock; + } + r = amdgpu_vm_bo_update(adev, bo_va, clear); +unlock: + if (unlock) + dma_resv_unlock(resv); if (r) return r; + spin_lock(&vm->status_lock); + } + spin_unlock(&vm->status_lock); + + return 0; +} + +/** + * amdgpu_vm_fence_imports - add fence to valid DMABuf imports + * + * @vm: requested vm + * @ticket: optional reservation ticket used to reserve the VM + * @fence: fence to add + * + * Add the specified fence to all dymanic DMABuf imports that are valid. + * + * Returns: + * 0 for success. + */ +int amdgpu_vm_fence_imports(struct amdgpu_vm *vm, + struct ww_acquire_ctx *ticket, + struct dma_fence *fence) +{ + struct amdgpu_bo_va *bo_va, *tmp; + struct dma_resv *resv; + LIST_HEAD(imports); + bool unlock; + int ret = 0; + + if (!vm->is_compute_context) + return 0; + + /* Move all the DMABuf imports to a private list so I can reserve + * them while not holding te status_lock. + */ + spin_lock(&vm->status_lock); + list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) { + if (bo_va->base.bo && bo_va->base.bo->tbo.base.import_attach && + dma_buf_is_dynamic(bo_va->base.bo->tbo.base.import_attach->dmabuf)) + list_move(&bo_va->base.vm_status, &imports); + } + spin_unlock(&vm->status_lock); + + list_for_each_entry(bo_va, &imports, base.vm_status) { + resv = bo_va->base.bo->tbo.base.resv; + + /* Try to reserve the BO */ + if (dma_resv_trylock(resv)) { + unlock = true; + /* The caller is already holding the reservation lock */ + } else if (ticket && dma_resv_locking_ctx(resv) == ticket) { + unlock = false; + } else { + WARN_ONCE(1, "Failed to reserve DMABuf import"); + ret = -EBUSY; + break; + } + + ret = dma_resv_reserve_fences(resv, 1); + if (!ret) + dma_resv_add_fence(resv, fence, + DMA_RESV_USAGE_BOOKKEEP); if (unlock) dma_resv_unlock(resv); - spin_lock(&vm->status_lock); + if (ret) + break; } + + spin_lock(&vm->status_lock); + list_splice(&imports, &vm->idle); spin_unlock(&vm->status_lock); - return 0; + return ret; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 939d0c2219c0..2db04b8fef97 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -443,7 +443,11 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, struct dma_fence **fence); int amdgpu_vm_handle_moved(struct amdgpu_device *adev, struct amdgpu_vm *vm, - struct ww_acquire_ctx *ticket); + struct ww_acquire_ctx *ticket, + bool validate); +int amdgpu_vm_fence_imports(struct amdgpu_vm *vm, + struct ww_acquire_ctx *ticket, + struct dma_fence *fence); int amdgpu_vm_flush_compute_tlb(struct amdgpu_device *adev, struct amdgpu_vm *vm, uint32_t flush_type, -- 2.34.1