Following error conditions are fixed: Unpin MMIO and DOORBELL BOs only after map count goes to zero Remove BO from validate list of a KFD process in a safe manner Print a warning message if unreserving GPUVMs encounters an error Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@xxxxxxx> --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index a1de900ba677..ee48e6591f99 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1013,14 +1013,22 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, mutex_unlock(&process_info->lock); } -static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem, +/** + * remove_kgd_mem_from_validate_list() - Remove BO from process's validate list, + * in an idempotent manner, so that restore worker can't access it anymore + * @mem: BO's membership handle in validate list + * @process_info: KFD process handle to which BO belongs + * + * Return: void + */ +static void remove_kgd_mem_from_validate_list(struct kgd_mem *mem, struct amdkfd_process_info *process_info) { struct ttm_validate_buffer *bo_list_entry; bo_list_entry = &mem->validate_list; mutex_lock(&process_info->lock); - list_del(&bo_list_entry->head); + list_del_init(&bo_list_entry->head); mutex_unlock(&process_info->lock); } @@ -1796,7 +1804,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( allocate_init_user_pages_failed: err_pin_bo: - remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info); + remove_kgd_mem_from_validate_list(*mem, avm->process_info); drm_vma_node_revoke(&gobj->vma_node, drm_priv); err_node_allow: /* Don't unreserve system mem limit twice */ @@ -1825,20 +1833,12 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( unsigned long bo_size = mem->bo->tbo.base.size; struct kfd_mem_attachment *entry, *tmp; struct bo_vm_reservation_context ctx; - struct ttm_validate_buffer *bo_list_entry; unsigned int mapped_to_gpu_memory; int ret; bool is_imported = false; mutex_lock(&mem->lock); - /* Unpin MMIO/DOORBELL BO's that were pinned during allocation */ - if (mem->alloc_flags & - (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL | - KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) { - amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo); - } - mapped_to_gpu_memory = mem->mapped_to_gpu_memory; is_imported = mem->is_imported; mutex_unlock(&mem->lock); @@ -1853,10 +1853,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( } /* Make sure restore workers don't access the BO any more */ - bo_list_entry = &mem->validate_list; - mutex_lock(&process_info->lock); - list_del(&bo_list_entry->head); - mutex_unlock(&process_info->lock); + remove_kgd_mem_from_validate_list(mem, process_info); /* No more MMU notifiers */ amdgpu_mn_unregister(mem->bo); @@ -1878,7 +1875,18 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( list_for_each_entry_safe(entry, tmp, &mem->attachments, list) kfd_mem_detach(entry); + /* Return success even in case of error */ ret = unreserve_bo_and_vms(&ctx, false, false); + if (unlikely(ret)) { + WARN_ONCE(ret, "Error in unreserving BO and associated VMs"); + ret = 0; + } + + /* Unpin MMIO/DOORBELL BO's that were pinned during allocation */ + if (mem->alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL | + KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) + amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo); /* Free the sync object */ amdgpu_sync_free(&mem->sync); @@ -2814,7 +2822,7 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem bo_reservation_failure: mutex_unlock(&(*mem)->process_info->lock); amdgpu_sync_free(&(*mem)->sync); - remove_kgd_mem_from_kfd_bo_list(*mem, process_info); + remove_kgd_mem_from_validate_list(*mem, process_info); amdgpu_bo_unref(&gws_bo); mutex_destroy(&(*mem)->lock); kfree(*mem); @@ -2832,7 +2840,7 @@ int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem) /* Remove BO from process's validate list so restore worker won't touch * it anymore */ - remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info); + remove_kgd_mem_from_validate_list(kgd_mem, process_info); ret = amdgpu_bo_reserve(gws_bo, false); if (unlikely(ret)) { -- 2.35.1