Re: [PATCH] drm/amdgpu: change reset lock from mutex to rw_semaphore

Christian König <ckoenig.leichtzumerken@xxxxxxxxx> · Thu, 20 Aug 2020 11:10:56 +0200

Am 20.08.20 um 04:09 schrieb Dennis Li:
clients don't need reset-lock for synchronization when no
GPU recovery.

Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx>

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8aec832b244..ec11ed2a9ca4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -954,7 +954,7 @@ struct amdgpu_device {
  
  	atomic_t 			in_gpu_reset;
  	enum pp_mp1_state               mp1_state;
-	struct mutex  lock_reset;
+	struct rw_semaphore reset_sem;
  	struct amdgpu_doorbell_index doorbell_index;
  
  	struct mutex			notifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 79b397800cbc..0090e850eab9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -101,14 +101,17 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
  
  	file->private_data = adev;
  
-	mutex_lock(&adev->lock_reset);
+	if (down_read_killable(&adev->reset_sem))
+		return -EINTR;

Better use ret = down_read_killable(); if (ret) return ret; here. Same 
for all other places of course.

+
  	if (adev->autodump.dumping.done) {
  		reinit_completion(&adev->autodump.dumping);
  		ret = 0;
  	} else {
  		ret = -EBUSY;
  	}
-	mutex_unlock(&adev->lock_reset);
+
+	up_read(&adev->reset_sem);
  
  	return ret;
  }
@@ -1242,7 +1245,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
  	}
  
  	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	if (down_read_killable(&adev->reset_sem))
+		return -EINTR;
  
  	/* hold on the scheduler */
  	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1269,7 +1273,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
  		kthread_unpark(ring->sched.thread);
  	}
  
-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);
  
  	pm_runtime_mark_last_busy(dev->dev);
  	pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1463,10 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
  		return -ENOMEM;
  
  	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	if (down_read_killable(&adev->reset_sem)) {
+		kfree(fences);
+		return -EINTR;

Maybe better use a "goto err;" style error handling here.

+	}
  
  	/* stop the scheduler */
  	kthread_park(ring->sched.thread);
@@ -1500,7 +1507,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
  	/* restart the scheduler */
  	kthread_unpark(ring->sched.thread);
  
-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);
  
  	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 78fd2c9a7b7d..82242e2f5658 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
  	mutex_init(&adev->virt.vf_errors.lock);
  	hash_init(adev->mn_hash);
  	atomic_set(&adev->in_gpu_reset, 0);
-	mutex_init(&adev->lock_reset);
+	init_rwsem(&adev->reset_sem);
  	mutex_init(&adev->psp.mutex);
  	mutex_init(&adev->notifier_lock);
  
@@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
  	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
  		return false;
  
-	mutex_lock(&adev->lock_reset);
+	down_write(&adev->reset_sem);
  
  	atomic_inc(&adev->gpu_reset_counter);
  	switch (amdgpu_asic_reset_method(adev)) {
@@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
  	amdgpu_vf_error_trans_all(adev);
  	adev->mp1_state = PP_MP1_STATE_NONE;
  	atomic_set(&adev->in_gpu_reset, 0);
-	mutex_unlock(&adev->lock_reset);
+	up_write(&adev->reset_sem);
  }
  
  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f27d83f2de78..8ac63f13fc6f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
  	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
  	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
  	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;
  
  	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
  	 * otherwise the mailbox msg will be ruined/reseted by
  	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
  	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		atomic_set(&adev->in_gpu_reset, 1);
+	down_read(&adev->reset_sem);

Somebody from the virtualization team should take a look at this as well.

Christian.

  
  	do {
  		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
@@ -261,10 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
  	} while (timeout > 1);
  
  flr_done:
-	if (locked) {
-		atomic_set(&adev->in_gpu_reset, 0);
-		mutex_unlock(&adev->lock_reset);
-	}
+	up_read(&adev->reset_sem);
  
  	/* Trigger recovery for world switch failure if no TDR */
  	if (amdgpu_device_should_recover_gpu(adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 3cb10ab943a6..bcc583f087e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -259,19 +259,12 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
  	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
  	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
  	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;
  
  	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
  	 * otherwise the mailbox msg will be ruined/reseted by
  	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
  	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		atomic_set(&adev->in_gpu_reset, 1);
+	down_read(&adev->reset_sem);
  
  	do {
  		if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
@@ -282,10 +275,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
  	} while (timeout > 1);
  
  flr_done:
-	if (locked) {
-		atomic_set(&adev->in_gpu_reset, 0);
-		mutex_unlock(&adev->lock_reset);
-	}
+	up_read(&adev->reset_sem);
  
  	/* Trigger recovery for world switch failure if no TDR */
  	if (amdgpu_device_should_recover_gpu(adev)

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx