Re: [PATCH 2/2] drm/amdkfd: Fix double release compute pasid

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2022-12-14 10:42, Philip Yang wrote:
If kfd_process_device_init_vm returns failure after vm is converted to
compute vm and vm->pasid set to compute pasid, KFD will not take
pdd->drm_file reference. As a result, the drm close file handler may be
called to release the compute pasid before the KFD process destroy worker
releases the same pasid and sets vm->pasid to zero; this generates the
WARNING backtrace and NULL pointer access shown below.

Add helper amdgpu_amdkfd_gpuvm_set_vm_pasid and call it at the last step
of kfd_process_device_init_vm, to ensure vm pasid is the original pasid
if acquiring the vm failed, or is the compute pasid with the pdd->drm_file
reference taken, to avoid a double release of the same pasid.

  amdgpu: Failed to create process VM object
  ida_free called for id=32770 which is not allocated.
  WARNING: CPU: 57 PID: 72542 at ../lib/idr.c:522 ida_free+0x96/0x140
  RIP: 0010:ida_free+0x96/0x140
  Call Trace:
   amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu]
   amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu]
   drm_file_free.part.13+0x216/0x270 [drm]
   drm_close_helper.isra.14+0x60/0x70 [drm]
   drm_release+0x6e/0xf0 [drm]
   __fput+0xcc/0x280
   ____fput+0xe/0x20
   task_work_run+0x96/0xc0
   do_exit+0x3d0/0xc10

  BUG: kernel NULL pointer dereference, address: 0000000000000000
  RIP: 0010:ida_free+0x76/0x140
  Call Trace:
   amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu]
   amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu]
   drm_file_free.part.13+0x216/0x270 [drm]
   drm_close_helper.isra.14+0x60/0x70 [drm]
   drm_release+0x6e/0xf0 [drm]
   __fput+0xcc/0x280
   ____fput+0xe/0x20
   task_work_run+0x96/0xc0
   do_exit+0x3d0/0xc10

Suggested-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>

I don't think I suggested this fix. I didn't realize that the problem only affected the case where kfd_process_device_init_vm fails. Anyway, the series is

Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>


Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  4 +-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 39 +++++++++++++------
  drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 12 ++++--
  3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 589939631ed4..0040deaf8a83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -270,8 +270,10 @@ int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_
  	(&((struct amdgpu_fpriv *)					\
  		((struct drm_file *)(drm_priv))->driver_priv)->vm)
+int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
+				     struct file *filp, u32 pasid);
  int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
-					struct file *filp, u32 pasid,
+					struct file *filp,
  					void **process_info,
  					struct dma_fence **ef);
  void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0a854bb8b47e..b15091d8310d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1429,10 +1429,9 @@ static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
  	amdgpu_bo_unreserve(bo);
  }
-int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
-					   struct file *filp, u32 pasid,
-					   void **process_info,
-					   struct dma_fence **ef)
+int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
+				     struct file *filp, u32 pasid)
+
  {
  	struct amdgpu_fpriv *drv_priv;
  	struct amdgpu_vm *avm;
@@ -1443,10 +1442,6 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
  		return ret;
  	avm = &drv_priv->vm;
-	/* Already a compute VM? */
-	if (avm->process_info)
-		return -EINVAL;
-
  	/* Free the original amdgpu allocated pasid,
  	 * will be replaced with kfd allocated pasid.
  	 */
@@ -1455,14 +1450,36 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
  		amdgpu_vm_set_pasid(adev, avm, 0);
  	}
-	/* Convert VM into a compute VM */
-	ret = amdgpu_vm_make_compute(adev, avm);
+	ret = amdgpu_vm_set_pasid(adev, avm, pasid);
  	if (ret)
  		return ret;
-	ret = amdgpu_vm_set_pasid(adev, avm, pasid);
+	return 0;
+}
+
+int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
+					   struct file *filp,
+					   void **process_info,
+					   struct dma_fence **ef)
+{
+	struct amdgpu_fpriv *drv_priv;
+	struct amdgpu_vm *avm;
+	int ret;
+
+	ret = amdgpu_file_to_fpriv(filp, &drv_priv);
  	if (ret)
  		return ret;
+	avm = &drv_priv->vm;
+
+	/* Already a compute VM? */
+	if (avm->process_info)
+		return -EINVAL;
+
+	/* Convert VM into a compute VM */
+	ret = amdgpu_vm_make_compute(adev, avm);
+	if (ret)
+		return ret;
+
  	/* Initialize KFD part of the VM and process info */
  	ret = init_kfd_vm(avm, process_info, ef);
  	if (ret)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 6caa9dd57ff1..51b1683ac5c1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1576,9 +1576,9 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
  	p = pdd->process;
  	dev = pdd->dev;
-	ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(
-		dev->adev, drm_file, p->pasid,
-		&p->kgd_process_info, &p->ef);
+	ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, drm_file,
+						     &p->kgd_process_info,
+						     &p->ef);
  	if (ret) {
  		pr_err("Failed to create process VM object\n");
  		return ret;
@@ -1593,10 +1593,16 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
  	if (ret)
  		goto err_init_cwsr;
+	ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, drm_file, p->pasid);
+	if (ret)
+		goto err_set_pasid;
+
  	pdd->drm_file = drm_file;
  	return 0;
+err_set_pasid:
+	kfd_process_device_destroy_cwsr_dgpu(pdd);
  err_init_cwsr:
  	kfd_process_device_destroy_ib_mem(pdd);
  err_reserve_ib_mem:



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux