RE: [PATCH 1/2] drm/amdgpu: initialize ras structures for xgmi block

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I'd like to keep the conditional check in amdgpu_xgmi_ras_late_init internal so that it can be used anywhere without any conditional check from external. Please check v2.

Regards,
Hawking

-----Original Message-----
From: Chen, Guchun <Guchun.Chen@xxxxxxx> 
Sent: 2019年9月9日 9:22
To: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Dennis <Dennis.Li@xxxxxxx>; Deucher, Alexander <Alexander.Deucher@xxxxxxx>
Subject: RE: [PATCH 1/2] drm/amdgpu: initialize ras structures for xgmi block



-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> 
Sent: Monday, September 9, 2019 6:07 AM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Chen, Guchun <Guchun.Chen@xxxxxxx>; Li, Dennis <Dennis.Li@xxxxxxx>; Deucher, Alexander <Alexander.Deucher@xxxxxxx>
Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
Subject: [PATCH 1/2] drm/amdgpu: initialize ras structures for xgmi block

init ras common interface and fs node for xgmi block

Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  1 +  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++++++++++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  7 +++++++
 4 files changed, 45 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 331ce50..f09bd30 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -120,6 +120,7 @@ struct amdgpu_xgmi {
 	/* gpu list in the same hive */
 	struct list_head head;
 	bool supported;
+	struct ras_common_if *ras_if;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 65aae75..7f6f2e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -25,6 +25,7 @@
 #include "amdgpu.h"
 #include "amdgpu_xgmi.h"
 #include "amdgpu_smu.h"
+#include "amdgpu_ras.h"
 #include "df/df_3_6_offset.h"
 
 static DEFINE_MUTEX(xgmi_mutex);
@@ -437,3 +438,38 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 		mutex_unlock(&hive->hive_lock);
 	}
 }
+
+int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) {
+	int r;
+	struct ras_ih_if ih_info = {
+		.cb = NULL,
+	};
+	struct ras_fs_if fs_info = {
+		.sysfs_name = "xgmi_wafl_err_count",
+		.debugfs_name = "xgmi_wafl_err_inject",
+	};
+
+	if (!adev->gmc.xgmi.supported ||
+	    adev->gmc.xgmi.num_physical_nodes == 0)
+		return 0;
[Guchun]Looks this check num_physical_nodes==0 is redundant, as there is already one check outside of this function.
Is it better to move these two conditions outside of this function, to allow function entrance once xgmi.supported is true and num_physical_nodes > 0?
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		r = amdgpu_xgmi_ras_late_init(adev);
		if (r)
			return r;
	}

+	if (!adev->gmc.xgmi.ras_if) {
+		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+		if (!adev->gmc.xgmi.ras_if)
+			return -ENOMEM;
+		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
+		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+		adev->gmc.xgmi.ras_if->sub_block_index = 0;
+		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
+	}
+	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
+	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
+				 &fs_info, &ih_info);
+	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
+		kfree(adev->gmc.xgmi.ras_if);
+		adev->gmc.xgmi.ras_if = NULL;
+	}
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index fbcee31..9023789 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -42,6 +42,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);  int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);  int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
 		struct amdgpu_device *peer_adev);
+int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev);
 
 static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
 		struct amdgpu_device *bo_adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index beb6c84..05a9a8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -51,6 +51,7 @@
 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
 
 #include "amdgpu_ras.h"
+#include "amdgpu_xgmi.h"
 
 /* add these here since we already include dce12 headers and these are for DCN */
 #define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION                                                          0x055d
@@ -802,6 +803,12 @@ static int gmc_v9_0_ecc_late_init(void *handle)
 		if (r)
 			return r;
 	}
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		r = amdgpu_xgmi_ras_late_init(adev);
+		if (r)
+			return r;
+	}
 	return 0;
 }
 
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux