[AMD Official Use Only] > -----邮件原件----- > 发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > 发送时间: Wednesday, November 17, 2021 7:24 PM > 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; > Clements, John <John.Clements@xxxxxxx>; Quan, Evan > <Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> > 主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get > ecc_table > > > > On 11/17/2021 3:41 PM, Stanley.Yang wrote: > > support ECC TABLE message, this table include unc ras error count and > > error address > > > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > > --- > > drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 7 ++++ > > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 38 > +++++++++++++++++++ > > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 + > > drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 24 ++++++++++++ > > drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 3 ++ > > 5 files changed, 74 insertions(+) > > > > diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > index 3557f4e7fc30..ea65de0160c3 100644 > > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > @@ -324,6 +324,7 @@ enum smu_table_id > > SMU_TABLE_OVERDRIVE, > > SMU_TABLE_I2C_COMMANDS, > > SMU_TABLE_PACE, > > + SMU_TABLE_ECCINFO, > > SMU_TABLE_COUNT, > > }; > > > > @@ -340,6 +341,7 @@ struct smu_table_context > > void *max_sustainable_clocks; > > struct smu_bios_boot_up_values boot_values; > > void *driver_pptable; > > + void *ecc_table; > > struct smu_table tables[SMU_TABLE_COUNT]; > > /* > > * The driver table is just a staging buffer for @@ -1261,6 > > +1263,11 @@ struct pptable_funcs { > > * > of SMUBUS table. > > */ > > int (*send_hbm_bad_pages_num)(struct smu_context *smu, > uint32_t > > size); > > + > > + /** > > + * @get_ecc_info: message SMU to get ECC INFO table. 
> > + */ > > + ssize_t (*get_ecc_info)(struct smu_context *smu, void *table); > > }; > > > > typedef enum { > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > index f835d86cc2f5..5e4ba0e14a91 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > @@ -190,6 +190,7 @@ static const struct cmn2asic_mapping > aldebaran_table_map[SMU_TABLE_COUNT] = { > > TAB_MAP(SMU_METRICS), > > TAB_MAP(DRIVER_SMU_CONFIG), > > TAB_MAP(I2C_COMMANDS), > > + TAB_MAP(ECCINFO), > > }; > > > > static const uint8_t aldebaran_throttler_map[] = { @@ -223,6 +224,9 > > @@ static int aldebaran_tables_init(struct smu_context *smu) > > SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, > sizeof(SwI2cRequest_t), > > PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); > > > > + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, > sizeof(EccInfoTable_t), > > + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); > > + > > smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), > GFP_KERNEL); > > if (!smu_table->metrics_table) > > return -ENOMEM; > > @@ -235,6 +239,10 @@ static int aldebaran_tables_init(struct smu_context > *smu) > > return -ENOMEM; > > } > > > > + smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, > GFP_KERNEL); > > + if (!smu_table->ecc_table) > > + return -ENOMEM; > > + > > return 0; > > } > > > > @@ -1765,6 +1773,35 @@ static ssize_t aldebaran_get_gpu_metrics(struct > smu_context *smu, > > return sizeof(struct gpu_metrics_v1_3); > > } > > > > +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu, > > + void *table) > > +{ > > + struct smu_table_context *smu_table = &smu->smu_table; > > + EccInfoTable_t ecc_table; > > + struct ecc_info_per_ch *ecc_info_per_channel = NULL; > > + int i, ret = 0; > > + struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table; > > + > > + ret = smu_cmn_get_ecc_info_table(smu, > > + &ecc_table); > > + if (ret) > > + return 
ret; > > + > > + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { > > + ecc_info_per_channel = &(eccinfo->ecc[i]); > > + ecc_info_per_channel->ce_count_lo_chip = > > + ecc_table.EccInfo[i].ce_count_lo_chip; > > + ecc_info_per_channel->ce_count_hi_chip = > > + ecc_table.EccInfo[i].ce_count_hi_chip; > > + ecc_info_per_channel->mca_umc_status = > > + ecc_table.EccInfo[i].mca_umc_status; > > + ecc_info_per_channel->mca_umc_addr = > > + ecc_table.EccInfo[i].mca_umc_addr; > > + } > > + > > + return ret; > > +} > > + > > static int aldebaran_mode1_reset(struct smu_context *smu) > > { > > u32 smu_version, fatal_err, param; > > @@ -1967,6 +2004,7 @@ static const struct pptable_funcs > aldebaran_ppt_funcs = { > > .i2c_init = aldebaran_i2c_control_init, > > .i2c_fini = aldebaran_i2c_control_fini, > > .send_hbm_bad_pages_num = > aldebaran_smu_send_hbm_bad_page_num, > > + .get_ecc_info = aldebaran_get_ecc_info, > > }; > > > > void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git > > a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > index 4d96099a9bb1..55421ea622fb 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct > smu_context *smu) > > kfree(smu_table->hardcode_pptable); > > smu_table->hardcode_pptable = NULL; > > > > + kfree(smu_table->ecc_table); > > kfree(smu_table->metrics_table); > > kfree(smu_table->watermarks_table); > > + smu_table->ecc_table = NULL; > > smu_table->metrics_table = NULL; > > smu_table->watermarks_table = NULL; > > smu_table->metrics_time = 0; > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > > index 843d2cbfc71d..e229c9b09d80 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > > @@ -983,6 +983,30 @@ int smu_cmn_get_metrics_table(struct > smu_context 
*smu, > > return ret; > > } > > > > +int smu_cmn_get_ecc_info_table(struct smu_context *smu, > > + void *ecc_table) > > +{ > > + struct smu_table_context *smu_table= &smu->smu_table; > > + uint32_t table_size = > > + smu_table->tables[SMU_TABLE_ECCINFO].size; > > + int ret = 0; > > + > > + ret = smu_cmn_update_table(smu, > > + SMU_TABLE_ECCINFO, > > + 0, > > + smu_table->ecc_table, > > + false); > > + if (ret) { > > + dev_info(smu->adev->dev, "Failed to export SMU ecc > table!\n"); > > + return ret; > > + } > > + > > + if (ecc_table) > > + memcpy(ecc_table, smu_table->ecc_table, table_size); > > This copy to another buffer is redundant. You may use ecc_table directly in > the callback, then this method itself looks unnecessary. Instead of calling > smu_cmn_get_ecc_info_table(), call smu_cmn_update_table() and copy > directly from ecc_table. [Yang, Stanley] This design is intended to protect ecc_table going forward, in case multiple threads call smu_cmn_get_ecc_info_table at the same time; a mutex lock should be added, just like the metrics table handling, if it becomes necessary. The current test case is simple, so I did not do that yet. > > Thanks, > Lijo > > > + > > + return 0; > > +} > > + > > void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t > crev) > > { > > struct metrics_table_header *header = (struct metrics_table_header > > *)table; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > > b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > > index beea03810bca..0adc5451373b 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > > @@ -105,6 +105,9 @@ int smu_cmn_get_metrics_table(struct > smu_context *smu, > > void *metrics_table, > > bool bypass_cache); > > > > +int smu_cmn_get_ecc_info_table(struct smu_context *smu, > > + void *table); > > + > > void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, > > uint8_t crev); > > > > int smu_cmn_set_mp1_state(struct smu_context *smu, > >