回复: [PATCH Review 4/4] query umc error info from ecc_table

"Yang, Stanley" <Stanley.Yang@xxxxxxx> · Thu, 18 Nov 2021 03:59:03 +0000



[AMD Official Use Only]


> -----邮件原件-----
> 发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
> 发送时间: Wednesday, November 17, 2021 7:15 PM
> 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd-
> gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>;
> Clements, John <John.Clements@xxxxxxx>; Quan, Evan
> <Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>
> 主题: Re: [PATCH Review 4/4] query umc error info from ecc_table
> 
> 
> 
> On 11/17/2021 3:41 PM, Stanley.Yang wrote:
> > if smu support ECCTABLE, driver can message smu to get ecc_table then
> > query umc error info from ECCTABLE apply pmfw version check to ensure
> > backward compatibility
> >
> > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 42 ++++++++---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  7 ++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       | 71 +++++++++++++--
> ----
> >   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  1 +
> >   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 12 ++++
> >   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  4 ++
> >   6 files changed, 107 insertions(+), 30 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 90f0db3b4f65..6b0f2ba1e420 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -888,6 +888,38 @@ void amdgpu_ras_mca_query_error_status(struct
> amdgpu_device *adev,
> >   	}
> >   }
> >
> > +static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev,
> > +struct ras_err_data *err_data) {
> > +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> > +
> > +	/*
> > +	 * choosing right query method according to
> > +	 * whether smu support query error information
> > +	 */
> > +	if ((ras->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
> > +			!smu_get_ecc_info(&adev->smu, (void *)&(ras-
> >umc_ecc))) {
> > +
> 
> This version check should be in aldebaran_ppt implementation. In general
> the callback will check the FW version that supports ECC table for the
> corresponding ASIC. It may return ENOTSUPP or similar if the FW version
> doesn't support ECC table and that may be checked here. Keeping
> smu_version in ras context is not needed.
[Yang, Stanley] I think just check Aldebaran_ppt callback function is not enough here, considering this scenario using amdgpu driver with get_ecc_info callback function but the pmfw is an older one without ecctable feature. PMFW support ecctable since 68.42.0 for Aldebaran.

> 
> > +		if (adev->umc.ras_funcs &&
> > +			adev->umc.ras_funcs-
> >message_smu_query_ras_error_count)
> > +			adev->umc.ras_funcs-
> >message_smu_query_ras_error_count(adev,
> > +err_data);
> > +
> > +		if (adev->umc.ras_funcs &&
> > +			adev->umc.ras_funcs-
> >message_smu_query_ras_error_address)
> > +			adev->umc.ras_funcs-
> >message_smu_query_ras_error_address(adev, err_data);
> > +	} else {
> > +		if (adev->umc.ras_funcs &&
> > +			adev->umc.ras_funcs->query_ras_error_count)
> > +			adev->umc.ras_funcs->query_ras_error_count(adev,
> err_data);
> > +
> > +		/* umc query_ras_error_address is also responsible for
> clearing
> > +		 * error status
> > +		 */
> > +		if (adev->umc.ras_funcs &&
> > +		    adev->umc.ras_funcs->query_ras_error_address)
> > +			adev->umc.ras_funcs-
> >query_ras_error_address(adev, err_data);
> > +	}
> > +}
> > +
> >   /* query/inject/cure begin */
> >   int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
> >   				  struct ras_query_if *info)
> > @@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct
> > amdgpu_device *adev,
> >
> >   	switch (info->head.block) {
> >   	case AMDGPU_RAS_BLOCK__UMC:
> > -		if (adev->umc.ras_funcs &&
> > -		    adev->umc.ras_funcs->query_ras_error_count)
> > -			adev->umc.ras_funcs->query_ras_error_count(adev,
> &err_data);
> > -		/* umc query_ras_error_address is also responsible for
> clearing
> > -		 * error status
> > -		 */
> > -		if (adev->umc.ras_funcs &&
> > -		    adev->umc.ras_funcs->query_ras_error_address)
> > -			adev->umc.ras_funcs-
> >query_ras_error_address(adev, &err_data);
> > +		amdgpu_ras_get_ecc_info(adev, &err_data);
> >   		break;
> >   	case AMDGPU_RAS_BLOCK__SDMA:
> >   		if (adev->sdma.funcs->query_ras_error_count) { diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index bcbf3264d92f..3f0de0cc8403 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -322,6 +322,12 @@ struct ras_common_if {
> >
> >   #define MAX_UMC_CHANNEL_NUM 32
> >
> > +/*
> > + * SMU support ECCTABLE since version 68.42.0,
> > + * use this to decide query umc error info method  */ #define
> > +SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> > +
> >   struct ecc_info_per_ch {
> >   	uint16_t ce_count_lo_chip;
> >   	uint16_t ce_count_hi_chip;
> > @@ -375,6 +381,7 @@ struct amdgpu_ras {
> >
> >   	/* record umc error info queried from smu */
> >   	struct umc_ecc_info umc_ecc;
> > +	uint32_t smu_version;
> >   };
> >
> >   struct ras_fs_data {
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > index 0c7c56a91b25..2c3e97c9410b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > @@ -97,28 +97,57 @@ int amdgpu_umc_process_ras_data_cb(struct
> amdgpu_device *adev,
> >   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> >
> >   	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> > -	if (adev->umc.ras_funcs &&
> > -	    adev->umc.ras_funcs->query_ras_error_count)
> > -	    adev->umc.ras_funcs->query_ras_error_count(adev,
> ras_error_status);
> >
> > -	if (adev->umc.ras_funcs &&
> > -	    adev->umc.ras_funcs->query_ras_error_address &&
> > -	    adev->umc.max_ras_err_cnt_per_query) {
> > -		err_data->err_addr =
> > -			kcalloc(adev->umc.max_ras_err_cnt_per_query,
> > -				sizeof(struct eeprom_table_record),
> GFP_KERNEL);
> > -
> > -		/* still call query_ras_error_address to clear error status
> > -		 * even NOMEM error is encountered
> > -		 */
> > -		if(!err_data->err_addr)
> > -			dev_warn(adev->dev, "Failed to alloc memory for "
> > -					"umc error address record!\n");
> > -
> > -		/* umc query_ras_error_address is also responsible for
> clearing
> > -		 * error status
> > -		 */
> > -		adev->umc.ras_funcs->query_ras_error_address(adev,
> ras_error_status);
> > +	if ((con->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
> > +			!smu_get_ecc_info(&adev->smu, (void *)&(con-
> >umc_ecc))) {
> > +
> Same comment as above.
> 
> > +		if (adev->umc.ras_funcs &&
> > +		    adev->umc.ras_funcs-
> >message_smu_query_ras_error_count)
> > +		    adev->umc.ras_funcs-
> >message_smu_query_ras_error_count(adev,
> > +ras_error_status);
> > +
> > +		if (adev->umc.ras_funcs &&
> > +		    adev->umc.ras_funcs-
> >message_smu_query_ras_error_address &&
> > +		    adev->umc.max_ras_err_cnt_per_query) {
> > +			err_data->err_addr =
> > +				kcalloc(adev-
> >umc.max_ras_err_cnt_per_query,
> > +					sizeof(struct eeprom_table_record),
> GFP_KERNEL);
> > +
> > +			/* still call query_ras_error_address to clear error
> status
> > +			 * even NOMEM error is encountered
> > +			 */
> > +			if(!err_data->err_addr)
> > +				dev_warn(adev->dev, "Failed to alloc
> memory for "
> > +						"umc error address
> record!\n");
> > +
> > +			/* umc query_ras_error_address is also responsible
> for clearing
> > +			 * error status
> > +			 */
> > +			adev->umc.ras_funcs-
> >message_smu_query_ras_error_address(adev, ras_error_status);
> > +		}
> > +	} else {
> > +		if (adev->umc.ras_funcs &&
> > +		    adev->umc.ras_funcs->query_ras_error_count)
> > +		    adev->umc.ras_funcs->query_ras_error_count(adev,
> > +ras_error_status);
> > +
> > +		if (adev->umc.ras_funcs &&
> > +		    adev->umc.ras_funcs->query_ras_error_address &&
> > +		    adev->umc.max_ras_err_cnt_per_query) {
> > +			err_data->err_addr =
> > +				kcalloc(adev-
> >umc.max_ras_err_cnt_per_query,
> > +					sizeof(struct eeprom_table_record),
> GFP_KERNEL);
> > +
> > +			/* still call query_ras_error_address to clear error
> status
> > +			 * even NOMEM error is encountered
> > +			 */
> > +			if(!err_data->err_addr)
> > +				dev_warn(adev->dev, "Failed to alloc
> memory for "
> > +						"umc error address
> record!\n");
> > +
> > +			/* umc query_ras_error_address is also responsible
> for clearing
> > +			 * error status
> > +			 */
> > +			adev->umc.ras_funcs-
> >query_ras_error_address(adev, ras_error_status);
> > +		}
> >   	}
> >
> >   	/* only uncorrectable error needs gpu reset */ diff --git
> > a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> > b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> > index ea65de0160c3..7a06021a58f0 100644
> > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> > @@ -1404,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu,
> > bool enable);
> >
> >   int smu_wait_for_event(struct amdgpu_device *adev, enum
> smu_event_type event,
> >   		       uint64_t event_arg);
> > +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
> >
> >   #endif
> >   #endif
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > index 01168b8955bf..6340c079f35e 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > @@ -3072,6 +3072,18 @@ int smu_set_light_sbr(struct smu_context *smu,
> bool enable)
> >   	return ret;
> >   }
> >
> > +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) {
> > +	int ret = -1;
> > +
> > +	if (smu->ppt_funcs &&
> > +		smu->ppt_funcs->get_ecc_info)
> > +		ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
> > +
> 
> Shouldn't return -1 if ppt func is not present. If ppt func is not present, that
> means this method is not supported for the SOC; return ENOTSUPP.
[Yang, Stanley] thanks, return -ENOTSUPP is more reasonable.
> 
> > +	return ret;
> > +
> > +}
> > +
> 
> Probably the above function should be clubbed with patch 3 - smu support
> for getting ras ecc info.
> 
> >   static int smu_get_prv_buffer_details(void *handle, void **addr, size_t
> *size)
> >   {
> >   	struct smu_context *smu = handle;
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > index 55421ea622fb..55ef10ca684a 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > @@ -200,11 +200,15 @@ int smu_v13_0_check_fw_version(struct
> smu_context *smu)
> >   	uint16_t smu_major;
> >   	uint8_t smu_minor, smu_debug;
> >   	int ret = 0;
> > +	struct amdgpu_ras *ras = amdgpu_ras_get_context(smu->adev);
> >
> >   	ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
> >   	if (ret)
> >   		return ret;
> >
> > +	/* record smu interface version, help umc query error method */
> > +	ras->smu_version = smu_version;
> > +
> 
> This is not needed. ASIC specific functions can check the FW version for ECC
> table support.
> 
> Thanks,
> Lijo
> 
> >   	smu_major = (smu_version >> 16) & 0xffff;
> >   	smu_minor = (smu_version >> 8) & 0xff;
> >   	smu_debug = (smu_version >> 0) & 0xff;
> >