No need to trigger eviction as the memory mapping will not be used anymore. All pt/pd bos share same resv, hence the same shared eviction fence. Everytime page table is freed, the fence will be signled and that cuases kfd unexcepted evictions. Signed-off-by: xinhui pan <xinhui.pan@xxxxxxxxxxx> --- change from v3: fix a coding error change from v2: based on Chris' drm/ttm: rework BO delayed delete patchset. --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 36 +++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 1 + drivers/gpu/drm/ttm/ttm_bo.c | 16 +++++---- 5 files changed, 49 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 47b0f2957d1f..265b1ed7264c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -96,6 +96,7 @@ struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, struct mm_struct *mm); bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm); struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); +int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo); struct amdkfd_process_info { /* List head of all VMs that belong to a KFD process */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef721cb65868..d4b117065c1e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -276,6 +276,41 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo, return 0; } +int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo) +{ + struct amdgpu_bo *root = bo; + struct amdgpu_vm_bo_base *vm_bo; + struct amdgpu_vm *vm; + struct amdkfd_process_info *info; + struct amdgpu_amdkfd_fence *ef; + int ret; + + while (root->parent) + root = root->parent; + + vm_bo = root->vm_bo; + if (!vm_bo) + return 0; + + vm = vm_bo->vm; + if (!vm) + return 0; + + info = vm->process_info; + if (!info || !info->eviction_fence) + return 0; + + ef = container_of(dma_fence_get(&info->eviction_fence->base), + struct amdgpu_amdkfd_fence, base); + + BUG_ON(!dma_resv_trylock(&bo->tbo.base._resv)); + ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef); + dma_resv_unlock(&bo->tbo.base._resv); + + dma_fence_put(&ef->base); + return ret; +} + static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain, bool wait) { @@ -1051,6 +1086,7 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, WARN_ON(!list_empty(&process_info->userptr_valid_list)); WARN_ON(!list_empty(&process_info->userptr_inval_list)); + vm->process_info = NULL; dma_fence_put(&process_info->eviction_fence->base); cancel_delayed_work_sync(&process_info->restore_userptr_work); put_pid(process_info->pid); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 6f60a581e3ba..3784d178c965 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1307,6 +1307,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (abo->kfd_bo) amdgpu_amdkfd_unreserve_memory_limit(abo); + amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo); + if (bo->mem.mem_type != TTM_PL_VRAM || !bo->mem.mm_node || !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 247f328b7223..eca4ec66c1ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -3109,6 +3109,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) } amdgpu_vm_free_pts(adev, vm, NULL); + root->vm_bo = NULL; amdgpu_bo_unreserve(root); amdgpu_bo_unref(&root); WARN_ON(vm->root.base.bo); diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 6c3cea509e25..855d3566381e 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -399,8 +399,7 @@ static int ttm_bo_individualize_resv(struct ttm_buffer_object *bo) BUG_ON(!dma_resv_trylock(&bo->base._resv)); r = dma_resv_copy_fences(&bo->base._resv, bo->base.resv); - if (r) - dma_resv_unlock(&bo->base._resv); + dma_resv_unlock(&bo->base._resv); return r; } @@ -565,9 +564,6 @@ static void ttm_bo_release(struct kref *kref) int ret; if (!bo->deleted) { - if (bo->bdev->driver->release_notify) - bo->bdev->driver->release_notify(bo); - drm_vma_offset_remove(bdev->vma_manager, &bo->base.vma_node); ttm_mem_io_lock(man, false); ttm_mem_io_free_vm(bo); @@ -581,6 +577,14 @@ static void ttm_bo_release(struct kref *kref) dma_resv_wait_timeout_rcu(bo->base.resv, true, false, 30 * HZ); } + + spin_lock(&ttm_bo_glob.lru_lock); + if (bo->type != ttm_bo_type_sg) + bo->base.resv = &bo->base._resv; + spin_unlock(&ttm_bo_glob.lru_lock); + + if (bo->bdev->driver->release_notify) + bo->bdev->driver->release_notify(bo); } if (!dma_resv_test_signaled_rcu(bo->base.resv, true)) { @@ -599,8 +603,6 @@ static void ttm_bo_release(struct kref *kref) } spin_lock(&ttm_bo_glob.lru_lock); - if (bo->type != ttm_bo_type_sg) - bo->base.resv = &bo->base._resv; kref_init(&bo->kref); list_add_tail(&bo->ddestroy, &bdev->ddestroy); spin_unlock(&ttm_bo_glob.lru_lock); -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx