When one GPU in a hive is performing a RAS reset, the other GPUs in the hive do not need to schedule recovery work to reset the GPU. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 34226ae010c7..cbb4d6ccc420 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2489,6 +2489,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + struct amdgpu_ras *tmp_ras; if (hive) { atomic_set(&hive->ras_recovery, 1); @@ -2499,11 +2500,19 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) * as part of recovery. */ list_for_each_entry(remote_adev, &hive->device_list, - gmc.xgmi.head) + gmc.xgmi.head) { + tmp_ras = amdgpu_ras_get_context(remote_adev); + /* When a gpu in hive is performing ras reset, other + * gpus in hive do not need to schedule recovery work + * to reset the gpu. + */ + atomic_set(&tmp_ras->in_recovery, 1); + if (amdgpu_ras_get_fed_status(remote_adev)) { amdgpu_ras_set_fed_all(adev, hive, true); break; } + } } if (!ras->disable_ras_err_cnt_harvest) { @@ -2556,6 +2565,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } + + if (hive) { + list_for_each_entry(remote_adev, &hive->device_list, + gmc.xgmi.head) { + tmp_ras = amdgpu_ras_get_context(remote_adev); + atomic_set(&tmp_ras->in_recovery, 0); + } + } + atomic_set(&ras->in_recovery, 0); if (hive) { atomic_set(&hive->ras_recovery, 0); -- 2.34.1