[AMD Official Use Only - Internal Distribution Only] Hi, Christian, Thanks for your review. I will update it according to your suggestion. Best Regards Dennis Li -----Original Message----- From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx> Sent: Thursday, August 20, 2020 5:11 PM To: Li, Dennis <Dennis.Li@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx> Subject: Re: [PATCH] drm/amdgpu: change reset lock from mutex to rw_semaphore Am 20.08.20 um 04:09 schrieb Dennis Li: > clients don't need reset-lock for synchronization when no GPU > recovery. > > Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index c8aec832b244..ec11ed2a9ca4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -954,7 +954,7 @@ struct amdgpu_device { > > atomic_t in_gpu_reset; > enum pp_mp1_state mp1_state; > - struct mutex lock_reset; > + struct rw_semaphore reset_sem; > struct amdgpu_doorbell_index doorbell_index; > > struct mutex notifier_lock; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > index 79b397800cbc..0090e850eab9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > @@ -101,14 +101,17 @@ static int amdgpu_debugfs_autodump_open(struct > inode *inode, struct file *file) > > file->private_data = adev; > > - mutex_lock(&adev->lock_reset); > + if (down_read_killable(&adev->reset_sem)) > + return -EINTR; Better use ret = down_read_killable(); if (ret) return ret; here. Same for all other places of course. > + > if (adev->autodump.dumping.done) { > reinit_completion(&adev->autodump.dumping); > ret = 0; > } else { > ret = -EBUSY; > } > - mutex_unlock(&adev->lock_reset); > + > + up_read(&adev->reset_sem); > > return ret; > } > @@ -1242,7 +1245,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) > } > > /* Avoid accidently unparking the sched thread during GPU reset */ > - mutex_lock(&adev->lock_reset); > + if (down_read_killable(&adev->reset_sem)) > + return -EINTR; > > /* hold on the scheduler */ > for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1273,7 @@ > static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) > kthread_unpark(ring->sched.thread); > } > > - mutex_unlock(&adev->lock_reset); > + up_read(&adev->reset_sem); > > pm_runtime_mark_last_busy(dev->dev); > pm_runtime_put_autosuspend(dev->dev); > @@ -1459,7 +1463,10 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) > return -ENOMEM; > > /* Avoid accidently unparking the sched thread during GPU reset */ > - mutex_lock(&adev->lock_reset); > + if (down_read_killable(&adev->reset_sem)) { > + kfree(fences); > + return -EINTR; Maybe better use a "goto err;" style error handling here. > + } > > /* stop the scheduler */ > kthread_park(ring->sched.thread); > @@ -1500,7 +1507,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) > /* restart the scheduler */ > kthread_unpark(ring->sched.thread); > > - mutex_unlock(&adev->lock_reset); > + up_read(&adev->reset_sem); > > ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 78fd2c9a7b7d..82242e2f5658 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, > mutex_init(&adev->virt.vf_errors.lock); > hash_init(adev->mn_hash); > atomic_set(&adev->in_gpu_reset, 0); > - mutex_init(&adev->lock_reset); > + init_rwsem(&adev->reset_sem); > mutex_init(&adev->psp.mutex); > mutex_init(&adev->notifier_lock); > > @@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev) > if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) > return false; > > - mutex_lock(&adev->lock_reset); > + down_write(&adev->reset_sem); > > atomic_inc(&adev->gpu_reset_counter); > switch (amdgpu_asic_reset_method(adev)) { @@ -4229,7 +4229,7 @@ > static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) > amdgpu_vf_error_trans_all(adev); > adev->mp1_state = PP_MP1_STATE_NONE; > atomic_set(&adev->in_gpu_reset, 0); > - mutex_unlock(&adev->lock_reset); > + up_write(&adev->reset_sem); > } > > static void amdgpu_device_resume_display_audio(struct amdgpu_device > *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index f27d83f2de78..8ac63f13fc6f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); > struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); > int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; > - int locked; > > /* block amdgpu_gpu_recover till msg FLR COMPLETE received, > * otherwise the mailbox msg will be ruined/reseted by > * the VF FLR. > - * > - * we can unlock the lock_reset to allow "amdgpu_job_timedout" > - * to run gpu_recover() after FLR_NOTIFICATION_CMPL received > - * which means host side had finished this VF's FLR. > */ > - locked = mutex_trylock(&adev->lock_reset); > - if (locked) > - atomic_set(&adev->in_gpu_reset, 1); > + down_read(&adev->reset_sem); Somebody from the virtualization team should take a look at this as well. Christian. > > do { > if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL) > @@ -261,10 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > } while (timeout > 1); > > flr_done: > - if (locked) { > - atomic_set(&adev->in_gpu_reset, 0); > - mutex_unlock(&adev->lock_reset); > - } > + up_read(&adev->reset_sem); > > /* Trigger recovery for world switch failure if no TDR */ > if (amdgpu_device_should_recover_gpu(adev) > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index 3cb10ab943a6..bcc583f087e7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -259,19 +259,12 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); > struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); > int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; > - int locked; > > /* block amdgpu_gpu_recover till msg FLR COMPLETE received, > * otherwise the mailbox msg will be ruined/reseted by > * the VF FLR. > - * > - * we can unlock the lock_reset to allow "amdgpu_job_timedout" > - * to run gpu_recover() after FLR_NOTIFICATION_CMPL received > - * which means host side had finished this VF's FLR. > */ > - locked = mutex_trylock(&adev->lock_reset); > - if (locked) > - atomic_set(&adev->in_gpu_reset, 1); > + down_read(&adev->reset_sem); > > do { > if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL) > @@ -282,10 +275,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > } while (timeout > 1); > > flr_done: > - if (locked) { > - atomic_set(&adev->in_gpu_reset, 0); > - mutex_unlock(&adev->lock_reset); > - } > + up_read(&adev->reset_sem); > > /* Trigger recovery for world switch failure if no TDR */ > if (amdgpu_device_should_recover_gpu(adev) _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx