Re: [PATCH v3 1/2] drm/amdgpu: Acquire ttm locks for dmaunmap

Felix Kuehling <felix.kuehling@xxxxxxx> · Tue, 24 Oct 2023 17:02:41 -0400

On 2023-10-24 15:20, David Francis wrote:
dmaunmap can call ttm_bo_validate, which expects the
ttm dma_resv to be held.

Acquire the locks in amdgpu_amdkfd_gpuvm_dmaunmap_mem.

Because the dmaunmap step can now fail, two new numbers
need to be tracked. n_dmaunmap_success tracks the number
of devices that have completed dmaunmap. If a device fails
to dmaunmap due to a signal interrupt, n_dmaunmap_bos tracks
the number of bos on that device that were successfully
dmaunmapped.

I think what you mean here is "tracks the number of attachments on that 
device".



Track those values in struct kgd_mem.

This failure can also cause the sync_memory step of the ioctl
to be repeated; it is idempotent, so this should not cause any issues.

Signed-off-by: David Francis <David.Francis@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  6 ++++-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 +++++++++++++++----
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 19 ++++++++++-----
  3 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3ad8dc523b42..c60564ec4312 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -86,6 +86,10 @@ struct kgd_mem {
  
  	bool aql_queue;
  	bool is_imported;
+
+	/* Used to track successful dmaunmap across retries in unmap ioctl */
+	uint32_t n_dmaunmap_success;
+	uint32_t n_dmaunmap_bos;
  };
  
  /* KFD Memory Eviction */
@@ -302,7 +306,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev,
  					  struct kgd_mem *mem, void *drm_priv);
  int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
  		struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv);
-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
  int amdgpu_amdkfd_gpuvm_sync_memory(
  		struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
  int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54f31a420229..c431132d7cc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2102,21 +2102,36 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
  	return ret;
  }
  
-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
  {
  	struct kfd_mem_attachment *entry;
  	struct amdgpu_vm *vm;
+	int ret;
+	int i = 0;
  
  	vm = drm_priv_to_vm(drm_priv);
  
  	mutex_lock(&mem->lock);
  
  	list_for_each_entry(entry, &mem->attachments, list) {
-		if (entry->bo_va->base.vm == vm)
-			kfd_mem_dmaunmap_attachment(mem, entry);
-	}
+		if (i >= mem->n_dmaunmap_bos) {
+			ret = amdgpu_bo_reserve(entry->bo_va->base.bo, false);
This will lock and unlock things that aren't needed. This should be 
inside the "if (entry->bo_va->base.vm == vm)" where it's actually 
calling the dmaunmap_attachment.

+			if (ret) {
+				mem->n_dmaunmap_bos = i;

This counting approach feels a bit fragile. Also, what you're counting 
with "i" is not the number of attachments per device, but the number of 
attachments overall. So this double counting with two counters in two 
places is probably redundant and could be simplified to using just one 
counter and in one place.

But this may still have issue in corner cases where multiple unmap 
ioctls are happening concurrently. I'm not sure if this happens in 
practice, but it's something that a robust implementation needs to 
handle. The consequences of getting this wrong would be resource leaks 
or DMA mappings or potentially double-frees of dma mappings.

What would be simpler and more robust is, to have a flag in struct 
kfd_mem_attachment that indicates whether it is currently dma-mapped or 
not. If it's not mapped, you assume it was already unmapped and you 
don't unmap it again. That way you wouldn't need to count at all and it 
handles concurrent calls without problems. There is already an is_mapped 
flag there. You could add an is_dmamapped flag.

Regards,
  Felix


+				goto out;
+			}
+
+			if (entry->bo_va->base.vm == vm)
+				kfd_mem_dmaunmap_attachment(mem, entry);
  
+			amdgpu_bo_unreserve(entry->bo_va->base.bo);
+		}
+		i++;
+	}
+	mem->n_dmaunmap_bos = 0;
+out:
  	mutex_unlock(&mem->lock);
+	return ret;
  }
  
  int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 06988cf1db51..66dee67ad859 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1366,7 +1366,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
  {
  	struct kfd_ioctl_unmap_memory_from_gpu_args *args = data;
  	struct kfd_process_device *pdd, *peer_pdd;
-	void *mem;
+	struct kgd_mem *mem;
  	long err = 0;
  	uint32_t *devices_arr = NULL, i;
  	bool flush_tlb;
@@ -1400,7 +1400,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
  		goto bind_process_to_device_failed;
  	}
  
-	mem = kfd_process_device_translate_handle(pdd,
+	mem = (struct kgd_mem *)kfd_process_device_translate_handle(pdd,
  						GET_IDR_HANDLE(args->handle));
  	if (!mem) {
  		err = -ENOMEM;
@@ -1414,7 +1414,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
  			goto get_mem_obj_from_handle_failed;
  		}
  		err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
-			peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv);
+			peer_pdd->dev->adev, mem, peer_pdd->drm_priv);
  		if (err) {
  			pr_err("Failed to unmap from gpu %d/%d\n",
  			       i, args->n_devices);
@@ -1426,7 +1426,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
  	flush_tlb = kfd_flush_tlb_after_unmap(pdd->dev->kfd);
  	if (flush_tlb) {
  		err = amdgpu_amdkfd_gpuvm_sync_memory(pdd->dev->adev,
-				(struct kgd_mem *) mem, true);
+				mem, true);
  		if (err) {
  			pr_debug("Sync memory failed, wait interrupted by user signal\n");
  			goto sync_memory_failed;
@@ -1434,7 +1434,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
  	}
  
  	/* Flush TLBs after waiting for the page table updates to complete */
-	for (i = 0; i < args->n_devices; i++) {
+	for (i = mem->n_dmaunmap_success; i < args->n_devices; i++) {
  		peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
  		if (WARN_ON_ONCE(!peer_pdd))
  			continue;
@@ -1442,8 +1442,14 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
  			kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
  
  		/* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */
-		amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
+		err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
+		if (err) {
+			pr_debug("DMA unmapping failed, acquire interrupted by user signal\n");
+			goto dmaunmap_failed;
+		}
+		mem->n_dmaunmap_success = i+1;
  	}
+	mem->n_dmaunmap_success = 0;
  
  	mutex_unlock(&p->mutex);
  
@@ -1455,6 +1461,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
  get_mem_obj_from_handle_failed:
  unmap_memory_from_gpu_failed:
  sync_memory_failed:
+dmaunmap_failed:
  	mutex_unlock(&p->mutex);
  copy_from_user_failed:
  	kfree(devices_arr);