[AMD Official Use Only - AMD Internal Distribution Only]

Series is Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao Zhou
Sent: Friday, May 31, 2024 18:49
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Subject: [PATCH 5/5] drm/amdgpu: add ras fatal flag to distinguish fatal error reset

Check the flag in mode1 reset.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 32 ++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h      |  1 +
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    |  2 +-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  2 +-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |  2 +-
 6 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2071e30d7e56..97b770ba6424 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2451,6 +2451,26 @@ bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
 	return false;
 }
 
+bool amdgpu_ras_in_fatal(struct amdgpu_device *adev)
+{
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	int hive_ras_fatal = 0;
+
+	if (!amdgpu_ras_in_recovery(adev))
+		return false;
+
+	if (hive) {
+		hive_ras_fatal = atomic_read(&hive->ras_fatal);
+		amdgpu_put_xgmi_hive(hive);
+	}
+
+	if (ras && (atomic_read(&ras->in_fatal) || hive_ras_fatal))
+		return true;
+
+	return false;
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
 	struct amdgpu_ras *ras =
@@ -2462,6 +2482,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 
 	if (hive) {
 		atomic_set(&hive->ras_recovery, 1);
+		if (atomic_read(&ras->in_fatal))
+			atomic_set(&hive->ras_fatal, 1);
 
 		/* If any device which is part of the hive received RAS fatal
 		 * error interrupt, set fatal error status on all. This
@@ -2526,8 +2548,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 			amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
 	}
 	atomic_set(&ras->in_recovery, 0);
+	atomic_set(&ras->in_fatal, 0);
 	if (hive) {
 		atomic_set(&hive->ras_recovery, 0);
+		atomic_set(&hive->ras_fatal, 0);
 		amdgpu_put_xgmi_hive(hive);
 	}
 }
@@ -2982,6 +3006,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	mutex_init(&con->recovery_lock);
 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
 	atomic_set(&con->in_recovery, 0);
+	atomic_set(&con->in_fatal, 0);
 	con->eeprom_control.bad_channel_bitmap = 0;
 
 	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
@@ -4006,8 +4031,13 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal)
 		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
 	}
 
-	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) {
+		if (fatal)
+			atomic_set(&ras->in_fatal, 1);
+
 		amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
+	}
+
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ed5793458a70..444a7fb7fbe3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -489,6 +489,7 @@ struct amdgpu_ras {
 	/* gpu recovery */
 	struct work_struct recovery_work;
 	atomic_t in_recovery;
+	atomic_t in_fatal;
 	struct amdgpu_device *adev;
 	/* error handler data */
 	struct ras_err_handler_data *eh_data;
@@ -953,6 +954,7 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
 		pasid_notify pasid_fn, void *data, uint32_t reset);
 
 bool amdgpu_ras_in_recovery(struct amdgpu_device *adev);
+bool amdgpu_ras_in_fatal(struct amdgpu_device *adev);
 
 __printf(3, 4)
 void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index a3bfc16de6d4..a6d6272a4ec6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -44,6 +44,7 @@ struct amdgpu_hive_info {
 
 	struct amdgpu_reset_domain *reset_domain;
 	atomic_t ras_recovery;
+	atomic_t ras_fatal;
 	struct ras_event_manager event_mgr;
 };
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 04533f99f1e3..a850e7b29d9d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1876,7 +1876,7 @@ static int aldebaran_mode1_reset(struct smu_context *smu)
 
 		/* fatal error triggered by ras, PMFW supports the flag from 68.44.0 */
 		if ((smu->smc_fw_version >= 0x00442c00) &&
-		    amdgpu_ras_in_recovery(adev))
+		    amdgpu_ras_in_fatal(adev))
 			fatal_err = 1;
 
 		param |= (fatal_err << 16);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index d1766a603bb9..d6c6c9a08e9d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2788,7 +2788,7 @@ static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu,
 	struct amdgpu_device *adev = smu->adev;
 
 	if ((smu->smc_fw_version >= supported_version) &&
-	    amdgpu_ras_in_recovery(adev))
+	    amdgpu_ras_in_fatal(adev))
 		/* Set RAS fatal error reset flag */
 		*param = 1 << 16;
 	else
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index c1d7528a6dc8..4434872bbe2e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2580,7 +2580,7 @@ static int smu_v13_0_6_mode1_reset(struct smu_context *smu)
 	param = SMU_RESET_MODE_1;
 
 	/* fatal error triggered by ras, PMFW supports the flag */
-	if (amdgpu_ras_in_recovery(adev))
+	if (amdgpu_ras_in_fatal(adev))
 		fatal_err = 1;
 
 	param |= (fatal_err << 16);
-- 
2.34.1
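
For readers tracing the flag flow rather than the individual hunks: the patch keeps one fatal flag per device (ras->in_fatal, set in amdgpu_ras_reset_gpu()) and one per XGMI hive (hive->ras_fatal, mirrored in amdgpu_ras_do_recovery()), and amdgpu_ras_in_fatal() reports fatal only while a recovery is actually in flight. Below is a minimal standalone C sketch of how those flags combine; it is not kernel code and not part of the patch, and struct hive_model, struct ras_model and ras_in_fatal() are simplified stand-ins for the kernel types touched above.

/*
 * Standalone sketch, not kernel code: struct hive_model and struct
 * ras_model are simplified stand-ins for amdgpu_hive_info and
 * amdgpu_ras; ras_in_fatal() mirrors the check added by this patch.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct hive_model {
	atomic_int ras_recovery;
	atomic_int ras_fatal;
};

struct ras_model {
	atomic_int in_recovery;
	atomic_int in_fatal;
	struct hive_model *hive;	/* NULL when the device is not in a hive */
};

/* Fatal is only reported while a recovery is in flight, and either the
 * per-device flag or the hive-wide flag is enough. */
static bool ras_in_fatal(struct ras_model *ras)
{
	int hive_fatal = 0;

	if (!atomic_load(&ras->in_recovery))
		return false;

	if (ras->hive)
		hive_fatal = atomic_load(&ras->hive->ras_fatal);

	return atomic_load(&ras->in_fatal) || hive_fatal;
}

int main(void)
{
	struct hive_model hive = { 0 };
	struct ras_model ras = { .hive = &hive };

	/* A fatal error schedules recovery and marks the device ... */
	atomic_store(&ras.in_recovery, 1);
	atomic_store(&ras.in_fatal, 1);
	/* ... and the recovery worker mirrors the flag onto the hive. */
	atomic_store(&hive.ras_fatal, 1);

	printf("fatal mode1 reset: %s\n", ras_in_fatal(&ras) ? "yes" : "no");
	return 0;
}

Modeled this way, every GPU in the hive would report a fatal-error mode1 reset to PMFW once any hive member sets the flag, which appears to be the intent of adding hive->ras_fatal alongside the per-device bit.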