dmaunmap can call ttm_bo_validate, which expects the ttm dma_resv to be held. Acquire the locks in amdgpu_amdkfd_gpuvm_dmaunmap_mem. Because the dmaunmap step can now fail, two new numbers need to be tracked. n_dmaunmap_success tracks the number of devices that have completed dmaunmap. If a device fails to dmaunmap due to a signal interrupt, n_dmaunmap_bos tracks the number of bos on that device that were successfully dmaunmapped. Track those values in struct kgd_mem. This failure can also cause the sync_memory step of the ioctl to be repeated; it is idempotent, so this should not cause any issues. Signed-off-by: David Francis <David.Francis@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 6 ++++- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 23 +++++++++++++++---- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 19 ++++++++++----- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 3ad8dc523b42..c60564ec4312 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -86,6 +86,10 @@ struct kgd_mem { bool aql_queue; bool is_imported; + + /* Used to track successful dmaunmap across retries in unmap ioctl */ + uint32_t n_dmaunmap_success; + uint32_t n_dmaunmap_bos; }; /* KFD Memory Eviction */ @@ -302,7 +306,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv); int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv); -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv); +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv); int amdgpu_amdkfd_gpuvm_sync_memory( struct amdgpu_device *adev, struct kgd_mem *mem, bool intr); int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54f31a420229..c431132d7cc1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2102,21 +2102,36 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( return ret; } -void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv) +int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv) { struct kfd_mem_attachment *entry; struct amdgpu_vm *vm; + int ret; + int i = 0; vm = drm_priv_to_vm(drm_priv); mutex_lock(&mem->lock); list_for_each_entry(entry, &mem->attachments, list) { - if (entry->bo_va->base.vm == vm) - kfd_mem_dmaunmap_attachment(mem, entry); - } + if (i >= mem->n_dmaunmap_bos) { + ret = amdgpu_bo_reserve(entry->bo_va->base.bo, false); + if (ret) { + mem->n_dmaunmap_bos = i; + goto out; + } + + if (entry->bo_va->base.vm == vm) + kfd_mem_dmaunmap_attachment(mem, entry); + amdgpu_bo_unreserve(entry->bo_va->base.bo); + } + i++; + } + mem->n_dmaunmap_bos = 0; +out: mutex_unlock(&mem->lock); + return ret; } int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 06988cf1db51..66dee67ad859 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1366,7 +1366,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, { struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; struct kfd_process_device *pdd, *peer_pdd; - void *mem; + struct kgd_mem *mem; long err = 0; uint32_t *devices_arr = NULL, i; bool flush_tlb; @@ -1400,7 +1400,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, goto bind_process_to_device_failed; } - mem = kfd_process_device_translate_handle(pdd, + mem = (struct kgd_mem *)kfd_process_device_translate_handle(pdd, GET_IDR_HANDLE(args->handle)); if (!mem) { err = -ENOMEM; @@ -1414,7 +1414,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, goto get_mem_obj_from_handle_failed; } err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( - peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv); + peer_pdd->dev->adev, mem, peer_pdd->drm_priv); if (err) { pr_err("Failed to unmap from gpu %d/%d\n", i, args->n_devices); @@ -1426,7 +1426,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, flush_tlb = kfd_flush_tlb_after_unmap(pdd->dev->kfd); if (flush_tlb) { err = amdgpu_amdkfd_gpuvm_sync_memory(pdd->dev->adev, - (struct kgd_mem *) mem, true); + mem, true); if (err) { pr_debug("Sync memory failed, wait interrupted by user signal\n"); goto sync_memory_failed; @@ -1434,7 +1434,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, } /* Flush TLBs after waiting for the page table updates to complete */ - for (i = 0; i < args->n_devices; i++) { + for (i = mem->n_dmaunmap_success; i < args->n_devices; i++) { peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); if (WARN_ON_ONCE(!peer_pdd)) continue; @@ -1442,8 +1442,14 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); /* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */ - amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv); + err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv); + if (err) { + pr_debug("DMA unmapping failed, acquire interrupted by user signal\n"); + goto dmaunmap_failed; + } + mem->n_dmaunmap_success = i+1; } + mem->n_dmaunmap_success = 0; mutex_unlock(&p->mutex); @@ -1455,6 +1461,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, get_mem_obj_from_handle_failed: unmap_memory_from_gpu_failed: sync_memory_failed: +dmaunmap_failed: mutex_unlock(&p->mutex); copy_from_user_failed: kfree(devices_arr); -- 2.34.1