[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Monk Liu <monk.liu@xxxxxxx> _____________________________________ Monk Liu|GPU Virtualization Team |AMD -----Original Message----- From: Dennis Li <Dennis.Li@xxxxxxx> Sent: Friday, August 21, 2020 11:48 AM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; Liu, Monk <Monk.Liu@xxxxxxx> Cc: Li, Dennis <Dennis.Li@xxxxxxx> Subject: [PATCH v3] drm/amdgpu: change reset lock from mutex to rw_semaphore clients don't need reset-lock for synchronization when no GPU recovery. v2: change to return the return value of down_read_killable. v3: if GPU recovery begin, VF ignore FLR notification. Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c8aec832b244..ec11ed2a9ca4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -954,7 +954,7 @@ struct amdgpu_device { atomic_t in_gpu_reset; enum pp_mp1_state mp1_state; -struct mutex lock_reset; +struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutexnotifier_lock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 79b397800cbc..cc5c7f81c540 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) file->private_data = adev; -mutex_lock(&adev->lock_reset); +ret = down_read_killable(&adev->reset_sem); +if (ret) +return ret; + if (adev->autodump.dumping.done) { reinit_completion(&adev->autodump.dumping); ret = 0; } else { ret = -EBUSY; } -mutex_unlock(&adev->lock_reset); + +up_read(&adev->reset_sem); return ret; } @@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) } /* Avoid accidently unparking the sched thread during GPU reset */ -mutex_lock(&adev->lock_reset); +r = down_read_killable(&adev->reset_sem); +if (r) +return r; /* hold on the scheduler */ for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1275,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) kthread_unpark(ring->sched.thread); } -mutex_unlock(&adev->lock_reset); +up_read(&adev->reset_sem); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); @@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) return -ENOMEM; /* Avoid accidently unparking the sched thread during GPU reset */ -mutex_lock(&adev->lock_reset); +r = down_read_killable(&adev->reset_sem); +if (r) +goto pro_end; /* stop the scheduler */ kthread_park(ring->sched.thread); @@ -1500,13 +1508,14 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) /* restart the scheduler */ kthread_unpark(ring->sched.thread); -mutex_unlock(&adev->lock_reset); +up_read(&adev->reset_sem); ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); +pro_end: kfree(fences); -return 0; +return r; } static int amdgpu_debugfs_sclk_set(void *data, u64 val) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 78fd2c9a7b7d..82242e2f5658 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); atomic_set(&adev->in_gpu_reset, 0); -mutex_init(&adev->lock_reset); +init_rwsem(&adev->reset_sem); mutex_init(&adev->psp.mutex); mutex_init(&adev->notifier_lock); @@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev) if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) return false; -mutex_lock(&adev->lock_reset); +down_write(&adev->reset_sem); atomic_inc(&adev->gpu_reset_counter); switch (amdgpu_asic_reset_method(adev)) { @@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; atomic_set(&adev->in_gpu_reset, 0); -mutex_unlock(&adev->lock_reset); +up_write(&adev->reset_sem); } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index f27d83f2de78..9c07014d9bd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -238,19 +238,15 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; -int locked; /* block amdgpu_gpu_recover till msg FLR COMPLETE received, * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. - * - * we can unlock the lock_reset to allow "amdgpu_job_timedout" - * to run gpu_recover() after FLR_NOTIFICATION_CMPL received - * which means host side had finished this VF's FLR. */ -locked = mutex_trylock(&adev->lock_reset); -if (locked) -atomic_set(&adev->in_gpu_reset, 1); +if (!down_read_trylock(&adev->reset_sem)) +return; + +atomic_set(&adev->in_gpu_reset, 1); do { if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL) @@ -261,10 +257,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: -if (locked) { -atomic_set(&adev->in_gpu_reset, 0); -mutex_unlock(&adev->lock_reset); -} +atomic_set(&adev->in_gpu_reset, 0); +up_read(&adev->reset_sem); /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 3cb10ab943a6..9c23abf9b140 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -259,19 +259,15 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; -int locked; /* block amdgpu_gpu_recover till msg FLR COMPLETE received, * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. - * - * we can unlock the lock_reset to allow "amdgpu_job_timedout" - * to run gpu_recover() after FLR_NOTIFICATION_CMPL received - * which means host side had finished this VF's FLR. */ -locked = mutex_trylock(&adev->lock_reset); -if (locked) -atomic_set(&adev->in_gpu_reset, 1); +if (!down_read_trylock(&adev->reset_sem)) +return; + +atomic_set(&adev->in_gpu_reset, 1); do { if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL) @@ -282,10 +278,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: -if (locked) { -atomic_set(&adev->in_gpu_reset, 0); -mutex_unlock(&adev->lock_reset); -} +atomic_set(&adev->in_gpu_reset, 0); +up_read(&adev->reset_sem); /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx