Add hive ras recovery check and propagate fatal error to aids of all sockets in the hive Signed-off-by: Asad Kamal <asad.kamal@xxxxxxx> Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 + drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 10 +++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5fb57419ef77..029871bfe714 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2061,9 +2061,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + if (hive) + atomic_set(&hive->ras_recovery, 1); if (!ras->disable_ras_err_cnt_harvest) { - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { @@ -2080,7 +2082,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_ras_log_on_err_counter(remote_adev); } - amdgpu_put_xgmi_hive(hive); } if (amdgpu_device_should_recover_gpu(ras->adev)) { @@ -2115,6 +2116,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } atomic_set(&ras->in_recovery, 0); + if (hive) { + atomic_set(&hive->ras_recovery, 0); + amdgpu_put_xgmi_hive(hive); + } } /* alloc/realloc bps array */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 86fbf56938f4..6cab882e8061 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -44,6 +44,7 @@ struct amdgpu_hive_info { struct amdgpu_reset_domain *reset_domain; uint32_t device_remove_count; + atomic_t ras_recovery; }; struct amdgpu_pcs_ras_field { diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 8220bdcbd927..29bb2a3a3cb1 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2184,13 +2184,21 @@ static int smu_v13_0_6_mode1_reset(struct smu_context *smu) struct amdgpu_ras *ras; u32 fatal_err, param; int ret = 0; + struct amdgpu_hive_info *hive = NULL; + u32 hive_ras_recovery = 0; + hive = amdgpu_get_xgmi_hive(adev); ras = amdgpu_ras_get_context(adev); fatal_err = 0; param = SMU_RESET_MODE_1; + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + /* fatal error triggered by ras, PMFW supports the flag */ - if (ras && atomic_read(&ras->in_recovery)) + if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) fatal_err = 1; param |= (fatal_err << 16); -- 2.42.0