Re: 回复: [PATCH Review 4/4] query umc error info from ecc_table

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 11/18/2021 9:29 AM, Yang, Stanley wrote:
[AMD Official Use Only]



-----邮件原件-----
发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
发送时间: Wednesday, November 17, 2021 7:15 PM
收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd-
gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>;
Clements, John <John.Clements@xxxxxxx>; Quan, Evan
<Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>
主题: Re: [PATCH Review 4/4] query umc error info from ecc_table



On 11/17/2021 3:41 PM, Stanley.Yang wrote:
if smu support ECCTABLE, driver can message smu to get ecc_table then
query umc error info from ECCTABLE apply pmfw version check to ensure
backward compatibility

Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 42 ++++++++---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  7 ++
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       | 71 +++++++++++++--
----
   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  1 +
   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 12 ++++
   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  4 ++
   6 files changed, 107 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..6b0f2ba1e420 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -888,6 +888,38 @@ void amdgpu_ras_mca_query_error_status(struct
amdgpu_device *adev,
   	}
   }

+static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev,
+struct ras_err_data *err_data) {
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	/*
+	 * choosing right query method according to
+	 * whether smu support query error information
+	 */
+	if ((ras->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+			!smu_get_ecc_info(&adev->smu, (void *)&(ras-
umc_ecc))) {
+

This version check should be in aldebaran_ppt implementation. In general
the callback will check the FW version that supports ECC table for the
corresponding ASIC. It may return ENOTSUPP or similar if the FW version
doesn't support ECC table and that may be checked here. Keeping
smu_version in ras context is not needed.
[Yang, Stanley] I think just check Aldebaran_ppt callback function is not enough here, considering this scenario using amdgpu driver with get_ecc_info callback function but the pmfw is an older one without ecctable feature. PMFW support ecctable since 68.42.0 for Aldebaran.

What I meant is the FW version check code should be part of aldebaran_get_ecc_info() function, and it shouldn't be here. That function checks if the FW supports ecc_info for aldebaran, if ont returns ENOTSUPP.

Similarly for a newer ASIC, the corresponding ppt_func checks compatibility. That is one of the purposes of ASIC specific callbacks.

Thanks,
Lijo


+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs-
message_smu_query_ras_error_count)
+			adev->umc.ras_funcs-
message_smu_query_ras_error_count(adev,
+err_data);
+
+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs-
message_smu_query_ras_error_address)
+			adev->umc.ras_funcs-
message_smu_query_ras_error_address(adev, err_data);
+	} else {
+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs->query_ras_error_count)
+			adev->umc.ras_funcs->query_ras_error_count(adev,
err_data);
+
+		/* umc query_ras_error_address is also responsible for
clearing
+		 * error status
+		 */
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_address)
+			adev->umc.ras_funcs-
query_ras_error_address(adev, err_data);
+	}
+}
+
   /* query/inject/cure begin */
   int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
   				  struct ras_query_if *info)
@@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct
amdgpu_device *adev,

   	switch (info->head.block) {
   	case AMDGPU_RAS_BLOCK__UMC:
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_count)
-			adev->umc.ras_funcs->query_ras_error_count(adev,
&err_data);
-		/* umc query_ras_error_address is also responsible for
clearing
-		 * error status
-		 */
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_address)
-			adev->umc.ras_funcs-
query_ras_error_address(adev, &err_data);
+		amdgpu_ras_get_ecc_info(adev, &err_data);
   		break;
   	case AMDGPU_RAS_BLOCK__SDMA:
   		if (adev->sdma.funcs->query_ras_error_count) { diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bcbf3264d92f..3f0de0cc8403 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -322,6 +322,12 @@ struct ras_common_if {

   #define MAX_UMC_CHANNEL_NUM 32

+/*
+ * SMU support ECCTABLE since version 68.42.0,
+ * use this to decide query umc error info method  */ #define
+SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+
   struct ecc_info_per_ch {
   	uint16_t ce_count_lo_chip;
   	uint16_t ce_count_hi_chip;
@@ -375,6 +381,7 @@ struct amdgpu_ras {

   	/* record umc error info queried from smu */
   	struct umc_ecc_info umc_ecc;
+	uint32_t smu_version;
   };

   struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c7c56a91b25..2c3e97c9410b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -97,28 +97,57 @@ int amdgpu_umc_process_ras_data_cb(struct
amdgpu_device *adev,
   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

   	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_error_count)
-	    adev->umc.ras_funcs->query_ras_error_count(adev,
ras_error_status);

-	if (adev->umc.ras_funcs &&
-	    adev->umc.ras_funcs->query_ras_error_address &&
-	    adev->umc.max_ras_err_cnt_per_query) {
-		err_data->err_addr =
-			kcalloc(adev->umc.max_ras_err_cnt_per_query,
-				sizeof(struct eeprom_table_record),
GFP_KERNEL);
-
-		/* still call query_ras_error_address to clear error status
-		 * even NOMEM error is encountered
-		 */
-		if(!err_data->err_addr)
-			dev_warn(adev->dev, "Failed to alloc memory for "
-					"umc error address record!\n");
-
-		/* umc query_ras_error_address is also responsible for
clearing
-		 * error status
-		 */
-		adev->umc.ras_funcs->query_ras_error_address(adev,
ras_error_status);
+	if ((con->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
+			!smu_get_ecc_info(&adev->smu, (void *)&(con-
umc_ecc))) {
+
Same comment as above.

+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs-
message_smu_query_ras_error_count)
+		    adev->umc.ras_funcs-
message_smu_query_ras_error_count(adev,
+ras_error_status);
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs-
message_smu_query_ras_error_address &&
+		    adev->umc.max_ras_err_cnt_per_query) {
+			err_data->err_addr =
+				kcalloc(adev-
umc.max_ras_err_cnt_per_query,
+					sizeof(struct eeprom_table_record),
GFP_KERNEL);
+
+			/* still call query_ras_error_address to clear error
status
+			 * even NOMEM error is encountered
+			 */
+			if(!err_data->err_addr)
+				dev_warn(adev->dev, "Failed to alloc
memory for "
+						"umc error address
record!\n");
+
+			/* umc query_ras_error_address is also responsible
for clearing
+			 * error status
+			 */
+			adev->umc.ras_funcs-
message_smu_query_ras_error_address(adev, ras_error_status);
+		}
+	} else {
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_count)
+		    adev->umc.ras_funcs->query_ras_error_count(adev,
+ras_error_status);
+
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_address &&
+		    adev->umc.max_ras_err_cnt_per_query) {
+			err_data->err_addr =
+				kcalloc(adev-
umc.max_ras_err_cnt_per_query,
+					sizeof(struct eeprom_table_record),
GFP_KERNEL);
+
+			/* still call query_ras_error_address to clear error
status
+			 * even NOMEM error is encountered
+			 */
+			if(!err_data->err_addr)
+				dev_warn(adev->dev, "Failed to alloc
memory for "
+						"umc error address
record!\n");
+
+			/* umc query_ras_error_address is also responsible
for clearing
+			 * error status
+			 */
+			adev->umc.ras_funcs-
query_ras_error_address(adev, ras_error_status);
+		}
   	}

   	/* only uncorrectable error needs gpu reset */ diff --git
a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index ea65de0160c3..7a06021a58f0 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -1404,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu,
bool enable);

   int smu_wait_for_event(struct amdgpu_device *adev, enum
smu_event_type event,
   		       uint64_t event_arg);
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);

   #endif
   #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 01168b8955bf..6340c079f35e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3072,6 +3072,18 @@ int smu_set_light_sbr(struct smu_context *smu,
bool enable)
   	return ret;
   }

+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) {
+	int ret = -1;
+
+	if (smu->ppt_funcs &&
+		smu->ppt_funcs->get_ecc_info)
+		ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
+

Shouldn't return -1 if ppt func is not present. If ppt func is not present, that
means this method is not supported for the SOC; return ENOTSUPP.
[Yang, Stanley] thanks, return -ENOTSUPP is more reasonable.

+	return ret;
+
+}
+

Probably the above function should be clubbed with patch 3 - smu support
for getting ras ecc info.

   static int smu_get_prv_buffer_details(void *handle, void **addr, size_t
*size)
   {
   	struct smu_context *smu = handle;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 55421ea622fb..55ef10ca684a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -200,11 +200,15 @@ int smu_v13_0_check_fw_version(struct
smu_context *smu)
   	uint16_t smu_major;
   	uint8_t smu_minor, smu_debug;
   	int ret = 0;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(smu->adev);

   	ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
   	if (ret)
   		return ret;

+	/* record smu interface version, help umc query error method */
+	ras->smu_version = smu_version;
+

This is not needed. ASIC specific functions can check the FW version for ECC
table support.

Thanks,
Lijo

   	smu_major = (smu_version >> 16) & 0xffff;
   	smu_minor = (smu_version >> 8) & 0xff;
   	smu_debug = (smu_version >> 0) & 0xff;




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux