Re: [PATCH 1/2] drm/amdgpu : Add hive ras recovery check

"Lazar, Lijo" <lijo.lazar@xxxxxxx> · Tue, 17 Oct 2023 10:14:12 +0530

On 10/17/2023 9:58 AM, Asad Kamal wrote:
Add hive ras recovery check and propagate fatal
error to aids of all sockets in the hive

Maybe reword it as: 'If one of the devices in the hive detects a fatal 
error, a RAS recovery reset message needs to be sent to the PMFW of all 
devices in the hive. To support that, add a flag to the hive to indicate 
that it is undergoing RAS recovery.'

One other comment inline.

With that addressed, the series is:

	Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx>


Signed-off-by: Asad Kamal <asad.kamal@xxxxxxx>
Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c              |  9 +++++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h             |  1 +
  drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 10 +++++++++-
  3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5fb57419ef77..029871bfe714 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2061,9 +2061,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
  	struct amdgpu_device *remote_adev = NULL;
  	struct amdgpu_device *adev = ras->adev;
  	struct list_head device_list, *device_list_handle =  NULL;
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
  
+	if (hive)
+		atomic_set(&hive->ras_recovery, 1);
  	if (!ras->disable_ras_err_cnt_harvest) {
-		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
  
  		/* Build list of devices to query RAS related errors */
  		if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -2080,7 +2082,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
  			amdgpu_ras_log_on_err_counter(remote_adev);
  		}
  
-		amdgpu_put_xgmi_hive(hive);
  	}
  
  	if (amdgpu_device_should_recover_gpu(ras->adev)) {
@@ -2115,6 +2116,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
  		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
  	}
  	atomic_set(&ras->in_recovery, 0);
+	if (hive) {
+		atomic_set(&hive->ras_recovery, 0);
+		amdgpu_put_xgmi_hive(hive);
+	}
  }
  
  /* alloc/realloc bps array */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 86fbf56938f4..6cab882e8061 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -44,6 +44,7 @@ struct amdgpu_hive_info {
  
  	struct amdgpu_reset_domain *reset_domain;
  	uint32_t device_remove_count;
+	atomic_t ras_recovery;
  };
  
  struct amdgpu_pcs_ras_field {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 8220bdcbd927..29bb2a3a3cb1 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2184,13 +2184,21 @@ static int smu_v13_0_6_mode1_reset(struct smu_context *smu)
  	struct amdgpu_ras *ras;
  	u32 fatal_err, param;
  	int ret = 0;
+	struct amdgpu_hive_info *hive = NULL;
+	u32 hive_ras_recovery = 0;

It would be better to keep reverse Christmas tree order (longest 
declaration line first, shortest last) for the whole set of declarations.

Thanks,
Lijo

  
+	hive = amdgpu_get_xgmi_hive(adev);
  	ras = amdgpu_ras_get_context(adev);
  	fatal_err = 0;
  	param = SMU_RESET_MODE_1;
  
+	if (hive) {
+		hive_ras_recovery = atomic_read(&hive->ras_recovery);
+		amdgpu_put_xgmi_hive(hive);
+	}
+
  	/* fatal error triggered by ras, PMFW supports the flag */
-	if (ras && atomic_read(&ras->in_recovery))
+	if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
  		fatal_err = 1;
  
  	param |= (fatal_err << 16);