On 2025-01-29 10:28, Christian König wrote: > Remove all KFD BOs from the private dma_resv object. > > This prevents the KFD from being evitec unecessarily when an exported BO > is released. > > Signed-off-by: Christian König <christian.koenig@xxxxxxx> Looks good to me. Assuming James doesn't find any issues in testing, the series is Reviewed-by: Felix Kuehling <felix.kuehling@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 5 +- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 52 ++++++++----------- > drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 38 ++++++++------ > 3 files changed, 47 insertions(+), 48 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index 4b80ad860639..62917f76da33 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -192,7 +192,7 @@ int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data); > #if IS_ENABLED(CONFIG_HSA_AMD) > bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm); > struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); > -int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo); > +void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo); > int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, > unsigned long cur_seq, struct kgd_mem *mem); > int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, > @@ -212,9 +212,8 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f) > } > > static inline > -int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo) > +void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo) > { > - return 0; > } > > static inline > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index f30548f4c3b3..609b27fe1cda 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -370,40 +370,32 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo, > return 0; > } > > -int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo) > +/** > + * amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences > + * @bo: the BO where to remove the evictions fences from. > + * > + * This functions should only be used on release when all references to the BO > + * are already dropped. We remove the eviction fence from the private copy of > + * the dma_resv object here since that is what is used during release to > + * determine of the BO is idle or not. > + */ > +void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo) > { > - struct amdgpu_bo *root = bo; > - struct amdgpu_vm_bo_base *vm_bo; > - struct amdgpu_vm *vm; > - struct amdkfd_process_info *info; > - struct amdgpu_amdkfd_fence *ef; > - int ret; > - > - /* we can always get vm_bo from root PD bo.*/ > - while (root->parent) > - root = root->parent; > + struct dma_resv *resv = &bo->tbo.base._resv; > + struct dma_fence *fence, *stub; > + struct dma_resv_iter cursor; > > - vm_bo = root->vm_bo; > - if (!vm_bo) > - return 0; > + dma_resv_assert_held(resv); > > - vm = vm_bo->vm; > - if (!vm) > - return 0; > - > - info = vm->process_info; > - if (!info || !info->eviction_fence) > - return 0; > - > - ef = container_of(dma_fence_get(&info->eviction_fence->base), > - struct amdgpu_amdkfd_fence, base); > - > - BUG_ON(!dma_resv_trylock(bo->tbo.base.resv)); > - ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef); > - dma_resv_unlock(bo->tbo.base.resv); > + stub = dma_fence_get_stub(); > + dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) { > + if (!to_amdgpu_amdkfd_fence(fence)) > + continue; > > - dma_fence_put(&ef->base); > - return ret; > + dma_resv_replace_fences(resv, fence->context, stub, > + DMA_RESV_USAGE_BOOKKEEP); > + } > + dma_fence_put(stub); > } > > static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > index fc94b8b9b86d..d12be7a1eb6e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > @@ -1194,28 +1194,36 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) > if (abo->kfd_bo) > amdgpu_amdkfd_release_notify(abo); > > - /* We only remove the fence if the resv has individualized. */ > - WARN_ON_ONCE(bo->type == ttm_bo_type_kernel > - && bo->base.resv != &bo->base._resv); > - if (bo->base.resv == &bo->base._resv) > - amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo); > + /* > + * We lock the private dma_resv object here and since the BO is about to > + * be released nobody else should have a pointer to it. > + * So when this locking here fails something is wrong with the reference > + * counting. > + */ > + if (WARN_ON_ONCE(!dma_resv_trylock(&bo->base._resv))) > + return; > + > + amdgpu_amdkfd_remove_all_eviction_fences(abo); > > if (!bo->resource || bo->resource->mem_type != TTM_PL_VRAM || > !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE) || > adev->in_suspend || drm_dev_is_unplugged(adev_to_drm(adev))) > - return; > + goto out; > > - if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) > - return; > + r = dma_resv_reserve_fences(&bo->base._resv, 1); > + if (r) > + goto out; > > - r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true); > - if (!WARN_ON(r)) { > - amdgpu_vram_mgr_set_cleared(bo->resource); > - amdgpu_bo_fence(abo, fence, false); > - dma_fence_put(fence); > - } > + r = amdgpu_fill_buffer(abo, 0, &bo->base._resv, &fence, true); > + if (WARN_ON(r)) > + goto out; > + > + amdgpu_vram_mgr_set_cleared(bo->resource); > + dma_resv_add_fence(&bo->base._resv, fence, DMA_RESV_USAGE_KERNEL); > + dma_fence_put(fence); > > - dma_resv_unlock(bo->base.resv); > +out: > + dma_resv_unlock(&bo->base._resv); > } > > /**