On 2022-06-09 13:17, Ramesh Errabolu wrote:
The following error conditions are fixed:
Ensure calls to delete a list element are safe
Unpin MMIO and DOORBELL BOs only after map count goes to zero
Print a warning message if unreserving VMs encounters an error
Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@xxxxxxx>
---
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 38 ++++++++++---------
1 file changed, 20 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index a1de900ba677..78b3efecc2f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1013,14 +1013,14 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
mutex_unlock(&process_info->lock);
}
-static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
+static void remove_kgd_mem_from_validate_list(struct kgd_mem *mem,
struct amdkfd_process_info *process_info)
{
struct ttm_validate_buffer *bo_list_entry;
bo_list_entry = &mem->validate_list;
mutex_lock(&process_info->lock);
- list_del(&bo_list_entry->head);
+ list_del_init(&bo_list_entry->head);
mutex_unlock(&process_info->lock);
}
@@ -1796,7 +1796,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
allocate_init_user_pages_failed:
err_pin_bo:
- remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
+ remove_kgd_mem_from_validate_list(*mem, avm->process_info);
drm_vma_node_revoke(&gobj->vma_node, drm_priv);
err_node_allow:
/* Don't unreserve system mem limit twice */
@@ -1825,20 +1825,12 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
unsigned long bo_size = mem->bo->tbo.base.size;
struct kfd_mem_attachment *entry, *tmp;
struct bo_vm_reservation_context ctx;
- struct ttm_validate_buffer *bo_list_entry;
unsigned int mapped_to_gpu_memory;
int ret;
bool is_imported = false;
mutex_lock(&mem->lock);
- /* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
- if (mem->alloc_flags &
- (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
- KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
- amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
- }
-
mapped_to_gpu_memory = mem->mapped_to_gpu_memory;
is_imported = mem->is_imported;
mutex_unlock(&mem->lock);
@@ -1852,11 +1844,10 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
return -EBUSY;
}
- /* Make sure restore workers don't access the BO any more */
- bo_list_entry = &mem->validate_list;
- mutex_lock(&process_info->lock);
- list_del(&bo_list_entry->head);
- mutex_unlock(&process_info->lock);
+ /* Make sure restore workers don't access the BO any more
+ * Ensure removal of BO from validate list is reentrant safe
The comment about being reentrant safe belongs in
remove_kgd_mem_from_validate_list. That said, "reentrant safe" is not
the correct term here. See
https://en.wikipedia.org/wiki/Reentrancy_(computing). It refers to
functions that can run multiple times concurrently, or be interrupted in
the middle. Neither of those is applicable here.
The correct term here is "idempotent". See
https://en.wikipedia.org/wiki/Idempotence. It requires that the function
can be called multiple times sequentially without changing the results
beyond the first call.
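For illustration, a minimal userspace sketch (mimicking, not using, the
kernel's <linux/list.h>) of why list_del_init() gives that idempotence:
list_del() poisons entry->next/prev after unlinking, so a second call on
the same entry dereferences poison pointers, whereas list_del_init()
re-points the entry at itself, so deleting it again is harmless.

    #include <stdio.h>

    struct list_head {
            struct list_head *next, *prev;
    };

    static void INIT_LIST_HEAD(struct list_head *h)
    {
            h->next = h;
            h->prev = h;
    }

    static void list_add(struct list_head *entry, struct list_head *head)
    {
            entry->next = head->next;
            entry->prev = head;
            head->next->prev = entry;
            head->next = entry;
    }

    /* Unlink and re-initialize: the entry points at itself afterwards,
     * so a repeated call only "unlinks" a self-linked node (idempotent).
     */
    static void list_del_init(struct list_head *entry)
    {
            entry->prev->next = entry->next;
            entry->next->prev = entry->prev;
            INIT_LIST_HEAD(entry);
    }

    int main(void)
    {
            struct list_head head, item;

            INIT_LIST_HEAD(&head);
            list_add(&item, &head);

            list_del_init(&item);   /* normal free path */
            list_del_init(&item);   /* error-path cleanup: still safe */

            printf("list empty: %d\n", head.next == &head);
            return 0;
    }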
With that fixed, the patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
+ */
+ remove_kgd_mem_from_validate_list(mem, process_info);
/* No more MMU notifiers */
amdgpu_mn_unregister(mem->bo);
@@ -1878,7 +1869,18 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
list_for_each_entry_safe(entry, tmp, &mem->attachments, list)
kfd_mem_detach(entry);
+ /* Return success even in case of error */
ret = unreserve_bo_and_vms(&ctx, false, false);
+ if (unlikely(ret)) {
+ WARN_ONCE(ret, "Error in unreserving BO and associated VMs");
+ ret = 0;
+ }
+
+ /* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
+ if (mem->alloc_flags &
+ (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+ KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))
+ amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
/* Free the sync object */
amdgpu_sync_free(&mem->sync);
@@ -2814,7 +2816,7 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem
bo_reservation_failure:
mutex_unlock(&(*mem)->process_info->lock);
amdgpu_sync_free(&(*mem)->sync);
- remove_kgd_mem_from_kfd_bo_list(*mem, process_info);
+ remove_kgd_mem_from_validate_list(*mem, process_info);
amdgpu_bo_unref(&gws_bo);
mutex_destroy(&(*mem)->lock);
kfree(*mem);
@@ -2832,7 +2834,7 @@ int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
/* Remove BO from process's validate list so restore worker won't touch
* it anymore
*/
- remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);
+ remove_kgd_mem_from_validate_list(kgd_mem, process_info);
ret = amdgpu_bo_reserve(gws_bo, false);
if (unlikely(ret)) {