Re: [PATCH 5/9] drm/amdkfd: Ensure user queue buffers residency

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2024-07-15 08:34, Philip Yang wrote:
Add atomic queue_refcount to struct bo_va, return -EBUSY to fail unmap
BO from the GPU if the bo_va queue_refcount is not zero.

Create queue to increase the bo_va queue_refcount, destroy queue to
decrease the bo_va queue_refcount, to ensure the queue buffers mapped on
the GPU when queue is active.

Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 14 ++++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h    |  6 ++++
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  3 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_queue.c        | 34 ++++++++++++++++---
  5 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ab37e7aec26..6d5fd371d5ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1252,7 +1252,7 @@ static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
  	return ret;
  }
-static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
+static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
  				struct kfd_mem_attachment *entry,
  				struct amdgpu_sync *sync)
  {
@@ -1260,11 +1260,18 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
  	struct amdgpu_device *adev = entry->adev;
  	struct amdgpu_vm *vm = bo_va->base.vm;
+ if (bo_va->queue_refcount) {
+		pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount);
+		return -EBUSY;
+	}
+
  	amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); amdgpu_sync_fence(sync, bo_va->last_pt_update);
+
+	return 0;
  }
static int update_gpuvm_pte(struct kgd_mem *mem,
@@ -2191,7 +2198,10 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
  		pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
  			 entry->va, entry->va + bo_size, entry);
- unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+		ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+		if (ret)
+			goto unreserve_out;
+
  		entry->is_mapped = false;
mem->mapped_to_gpu_memory--;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index bc42ccbde659..d7e27957013f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -90,6 +90,12 @@ struct amdgpu_bo_va {
  	bool				cleared;
bool is_xgmi;
+
+	/*
+	 * protected by vm reservation lock
+	 * if non-zero, cannot unmap from GPU because user queues may still access it
+	 */
+	unsigned int			queue_refcount;
  };
struct amdgpu_bo {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 202f24ee4bd7..65a37ac5a0f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1384,8 +1384,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
  		err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
  			peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv);
  		if (err) {
-			pr_err("Failed to unmap from gpu %d/%d\n",
-			       i, args->n_devices);
+			pr_debug("Failed to unmap from gpu %d/%d\n", i, args->n_devices);
  			goto unmap_memory_from_gpu_failed;
  		}
  		args->n_success = i+1;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d0dca20849d9..95fbdb12beb1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1291,6 +1291,7 @@ void print_queue_properties(struct queue_properties *q);
  void print_queue(struct queue *q);
  int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
  			 u64 expected_size);
+void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
  int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
  int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 0e661160c295..3fd386dcb011 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -106,6 +106,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_
  	}
*pbo = amdgpu_bo_ref(mapping->bo_va->base.bo);
+	mapping->bo_va->queue_refcount++;
  	return 0;
out_err:
@@ -113,6 +114,19 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_
  	return -EINVAL;
  }
+void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
+{
+	if (*bo) {
+		struct amdgpu_bo_va *bo_va;
+
+		bo_va = amdgpu_vm_bo_find(vm, *bo);
+		if (bo_va)
+			bo_va->queue_refcount--;
+	}
+
+	amdgpu_bo_unref(bo);
+}
+
  int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
  {
  	struct amdgpu_vm *vm;
@@ -166,10 +180,20 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
  {
-	amdgpu_bo_unref(&properties->wptr_bo);
-	amdgpu_bo_unref(&properties->rptr_bo);
-	amdgpu_bo_unref(&properties->ring_bo);
-	amdgpu_bo_unref(&properties->eop_buf_bo);
-	amdgpu_bo_unref(&properties->cwsr_bo);
+	struct amdgpu_vm *vm;
+	int err;
+
+	vm = drm_priv_to_vm(pdd->drm_priv);
+	err = amdgpu_bo_reserve(vm->root.bo, false);
+	if (err)
+		return err;

The caller never handles these errors. See my comments on patch 2.

Regards,
  Felix


+
+	kfd_queue_buffer_put(vm, &properties->wptr_bo);
+	kfd_queue_buffer_put(vm, &properties->rptr_bo);
+	kfd_queue_buffer_put(vm, &properties->ring_bo);
+	kfd_queue_buffer_put(vm, &properties->eop_buf_bo);
+	kfd_queue_buffer_put(vm, &properties->cwsr_bo);
+
+	amdgpu_bo_unreserve(vm->root.bo);
  	return 0;
  }



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux