[AMD Official Use Only - General] > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Felix > Kuehling > Sent: Tuesday, November 7, 2023 11:58 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Chen, Xiaogang <Xiaogang.Chen@xxxxxxx>; Errabolu, Ramesh > <Ramesh.Errabolu@xxxxxxx>; Koenig, Christian > <Christian.Koenig@xxxxxxx> > Subject: [PATCH 3/6] drm/amdgpu: Auto-validate DMABuf imports in > compute VMs > > Caution: This message originated from an External Source. Use proper caution > when opening attachments, clicking links, or responding. > > > DMABuf imports in compute VMs are not wrapped in a kgd_mem object on > the process_info->kfd_bo_list. There is no explicit KFD API call to validate them > or add eviction fences to them. > > This patch automatically validates and fences dymanic DMABuf imports when > they are added to a compute VM. Revalidation after evictions is handled in the > VM code. > > Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 + > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 15 ++- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 6 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 26 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 117 > +++++++++++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 6 +- > 7 files changed, 164 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index fcf8a98ad15e..68d534a89942 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -178,6 +178,9 @@ int > amdgpu_queue_mask_bit_to_set_resource_bit(struct amdgpu_device *adev, > struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, > struct mm_struct *mm, > struct svm_range_bo *svm_bo); > +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, > + uint32_t domain, > + struct dma_fence *fence); > #if defined(CONFIG_DEBUG_FS) > int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data); #endif diff -- > git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index 2e302956a279..0c1cb6048259 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -423,9 +423,9 @@ static int amdgpu_amdkfd_bo_validate(struct > amdgpu_bo *bo, uint32_t domain, > return ret; > } > > -static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, > - uint32_t domain, > - struct dma_fence *fence) > +int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, > + uint32_t domain, > + struct dma_fence *fence) > { > int ret = amdgpu_bo_reserve(bo, false); > > @@ -2948,7 +2948,7 @@ int > amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence > **ef) > struct amdgpu_device *adev = amdgpu_ttm_adev( > peer_vm->root.bo->tbo.bdev); > > - ret = amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket); > + ret = amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket, > + true); > if (ret) { > pr_debug("Memory eviction: handle moved failed. Try again\n"); > goto validate_map_fail; @@ -3001,7 +3001,7 @@ int > amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence > **ef) > &process_info->eviction_fence->base, > DMA_RESV_USAGE_BOOKKEEP); > } > - /* Attach eviction fence to PD / PT BOs */ > + /* Attach eviction fence to PD / PT BOs and DMABuf imports */ > list_for_each_entry(peer_vm, &process_info->vm_list_head, > vm_list_node) { > struct amdgpu_bo *bo = peer_vm->root.bo; @@ -3009,6 +3009,11 > @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct > dma_fence **ef) > dma_resv_add_fence(bo->tbo.base.resv, > &process_info->eviction_fence->base, > DMA_RESV_USAGE_BOOKKEEP); > + > + ret = amdgpu_vm_fence_imports(peer_vm, &ctx.ticket, > + &process_info->eviction_fence->base); > + if (ret) > + break; > } > > validate_map_fail: > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index aafedb344c1b..20f4be8cd635 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -1165,7 +1165,7 @@ static int amdgpu_cs_vm_handling(struct > amdgpu_cs_parser *p) > return r; > } > > - r = amdgpu_vm_handle_moved(adev, vm, &p->ticket); > + r = amdgpu_vm_handle_moved(adev, vm, &p->ticket, false); > if (r) > return r; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > index e7e87a3b2601..234244704f27 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c > @@ -373,6 +373,10 @@ amdgpu_dma_buf_move_notify(struct > dma_buf_attachment *attach) > struct amdgpu_vm_bo_base *bo_base; > int r; > > + /* FIXME: This should be after the "if", but needs a fix to make sure > + * DMABuf imports are initialized in the right VM list. > + */ > + amdgpu_vm_bo_invalidate(adev, bo, false); > if (!bo->tbo.resource || bo->tbo.resource->mem_type == > TTM_PL_SYSTEM) > return; > > @@ -409,7 +413,7 @@ amdgpu_dma_buf_move_notify(struct > dma_buf_attachment *attach) > if (!r) > r = amdgpu_vm_clear_freed(adev, vm, NULL); > if (!r) > - r = amdgpu_vm_handle_moved(adev, vm, ticket); > + r = amdgpu_vm_handle_moved(adev, vm, ticket, > + false); > > if (r && r != -EBUSY) > DRM_ERROR("Failed to invalidate VM page tables (%d))\n", diff -- > git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > index 849fffbb367d..755cc3c559f5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > @@ -186,6 +186,32 @@ static int amdgpu_gem_object_open(struct > drm_gem_object *obj, > else > ++bo_va->ref_count; > amdgpu_bo_unreserve(abo); > + > + /* Validate and add eviction fence to DMABuf imports with dymanic dymanic -> dynamic > + * attachment in compute VMs. Re-validation will be done by > + * amdgpu_vm_handle_moved and the fence will be updated by > + * amdgpu_vm_fence_imports in > amdgpu_amdkfd_gpuvm_restore_process_bos. > + * > + * Nested locking below for the case that a GEM object is opened in > + * kfd_mem_export_dmabuf. Since the lock below is only taken for > imports, > + * but not for export, this is a different lock class that cannot lead to > + * circular lock dependencies. > + */ > + if (!vm->is_compute_context || !vm->process_info) > + return 0; > + if (!obj->import_attach || > + !dma_buf_is_dynamic(obj->import_attach->dmabuf)) > + return 0; > + mutex_lock_nested(&vm->process_info->lock, 1); > + if (!WARN_ON(!vm->process_info->eviction_fence)) { > + r = amdgpu_amdkfd_bo_validate_and_fence(abo, > AMDGPU_GEM_DOMAIN_GTT, > + &vm->process_info->eviction_fence->base); > + if (r) > + dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n", > + vm->task_info.pid, r); > + } > + mutex_unlock(&vm->process_info->lock); > + > return 0; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 0d685577243c..b2c7449ab561 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -1307,6 +1307,7 @@ int amdgpu_vm_clear_freed(struct > amdgpu_device *adev, > * @adev: amdgpu_device pointer > * @vm: requested vm > * @ticket: optional reservation ticket used to reserve the VM > + * @valitate: whether to auto-validate invalid DMABuf imports valitate -> validate > * > * Make sure all BOs which are moved are updated in the PTs. > * > @@ -1317,7 +1318,8 @@ int amdgpu_vm_clear_freed(struct > amdgpu_device *adev, > */ > int amdgpu_vm_handle_moved(struct amdgpu_device *adev, > struct amdgpu_vm *vm, > - struct ww_acquire_ctx *ticket) > + struct ww_acquire_ctx *ticket, > + bool validate) > { > struct amdgpu_bo_va *bo_va; > struct dma_resv *resv; > @@ -1337,6 +1339,12 @@ int amdgpu_vm_handle_moved(struct > amdgpu_device *adev, > spin_lock(&vm->status_lock); > } > > + /* If we're validating user BOs, splice all evicted user BOs into > + * the list of invalid BOs for revalidation > + */ > + if (validate) > + list_splice_init(&vm->evicted_user, &vm->invalidated); > + > while (!list_empty(&vm->invalidated)) { > bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, > base.vm_status); @@ -1357,17 +1365,120 @@ int > amdgpu_vm_handle_moved(struct amdgpu_device *adev, > unlock = false; > } > > + /* Automatically validate DMABuf imports in compute VMs, if we > + * have a reservation, or remember them for later validation. > + */ > + if (vm->is_compute_context && bo_va->base.bo && > + bo_va->base.bo->tbo.base.import_attach && > + (!bo_va->base.bo->tbo.resource || > + bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM)) > { > + struct ttm_operation_ctx ctx = { true, false }; > + struct amdgpu_bo *bo = bo_va->base.bo; > + > + if (!validate) { > + r = amdgpu_vm_bo_update(adev, bo_va, clear); > + if (!r) > + amdgpu_vm_bo_evicted_user(&bo_va->base); > + goto unlock; > + } > + > + if (clear) { > + pr_warn_ratelimited("Invalid DMABuf import is busy in pid > %d\n", vm->task_info.pid); > + r = -EBUSY; > + goto unlock; > + } > + > + amdgpu_bo_placement_from_domain(bo, > + bo->preferred_domains); > + r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); > + if (r) > + goto unlock; > + r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, > + true); > + if (r) > + goto unlock; > + } > + > r = amdgpu_vm_bo_update(adev, bo_va, clear); > +unlock: > + if (unlock) > + dma_resv_unlock(resv); > if (r) > return r; > + spin_lock(&vm->status_lock); > + } > + spin_unlock(&vm->status_lock); > + > + return 0; > +} > + > +/** > + * amdgpu_vm_fence_imports - add fence to valid DMABuf imports > + * > + * @vm: requested vm > + * @ticket: optional reservation ticket used to reserve the VM > + * @fence: fence to add > + * > + * Add the specified fence to all dymanic DMABuf imports that are valid. > + * > + * Returns: > + * 0 for success. > + */ > +int amdgpu_vm_fence_imports(struct amdgpu_vm *vm, > + struct ww_acquire_ctx *ticket, > + struct dma_fence *fence) { > + struct amdgpu_bo_va *bo_va, *tmp; > + struct dma_resv *resv; > + LIST_HEAD(imports); > + bool unlock; > + int ret = 0; > + > + if (!vm->is_compute_context) > + return 0; > + > + /* Move all the DMABuf imports to a private list so I can reserve > + * them while not holding te status_lock. > + */ > + spin_lock(&vm->status_lock); > + list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) { > + if (bo_va->base.bo && bo_va->base.bo->tbo.base.import_attach && > + dma_buf_is_dynamic(bo_va->base.bo->tbo.base.import_attach- > >dmabuf)) > + list_move(&bo_va->base.vm_status, &imports); > + } > + spin_unlock(&vm->status_lock); > + > + list_for_each_entry(bo_va, &imports, base.vm_status) { > + resv = bo_va->base.bo->tbo.base.resv; > + > + /* Try to reserve the BO */ > + if (dma_resv_trylock(resv)) { > + unlock = true; > + /* The caller is already holding the reservation lock */ > + } else if (ticket && dma_resv_locking_ctx(resv) == ticket) { > + unlock = false; > + } else { > + WARN_ONCE(1, "Failed to reserve DMABuf import"); > + ret = -EBUSY; > + break; > + } > + > + ret = dma_resv_reserve_fences(resv, 1); > + if (!ret) > + dma_resv_add_fence(resv, fence, > + DMA_RESV_USAGE_BOOKKEEP); > > if (unlock) > dma_resv_unlock(resv); > - spin_lock(&vm->status_lock); > + if (ret) > + break; > } > + > + spin_lock(&vm->status_lock); > + list_splice(&imports, &vm->idle); > spin_unlock(&vm->status_lock); > > - return 0; > + return ret; > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index 939d0c2219c0..2db04b8fef97 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -443,7 +443,11 @@ int amdgpu_vm_clear_freed(struct amdgpu_device > *adev, > struct dma_fence **fence); int > amdgpu_vm_handle_moved(struct amdgpu_device *adev, > struct amdgpu_vm *vm, > - struct ww_acquire_ctx *ticket); > + struct ww_acquire_ctx *ticket, > + bool validate); int > +amdgpu_vm_fence_imports(struct amdgpu_vm *vm, > + struct ww_acquire_ctx *ticket, > + struct dma_fence *fence); > int amdgpu_vm_flush_compute_tlb(struct amdgpu_device *adev, > struct amdgpu_vm *vm, > uint32_t flush_type, > -- > 2.34.1