[AMD Official Use Only] > -----邮件原件----- > 发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > 发送时间: Thursday, November 18, 2021 12:04 PM > 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; > Clements, John <John.Clements@xxxxxxx>; Quan, Evan > <Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> > 主题: Re: 回复: [PATCH Review 3/4] drm/amdgpu: add message smu to get > ecc_table > > > > On 11/18/2021 9:07 AM, Yang, Stanley wrote: > > [AMD Official Use Only] > > > > > > > >> -----邮件原件----- > >> 发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > >> 发送时间: Wednesday, November 17, 2021 7:24 PM > >> 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- > >> gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; > >> Clements, John <John.Clements@xxxxxxx>; Quan, Evan > >> <Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) > <KevinYang.Wang@xxxxxxx> > >> 主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get > >> ecc_table > >> > >> > >> > >> On 11/17/2021 3:41 PM, Stanley.Yang wrote: > >>> support ECC TABLE message, this table include unc ras error count > >>> and error address > >>> > >>> Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > >>> --- > >>> drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 7 ++++ > >>> .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 38 > >> +++++++++++++++++++ > >>> .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 + > >>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 24 ++++++++++++ > >>> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 3 ++ > >>> 5 files changed, 74 insertions(+) > >>> > >>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > >>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > >>> index 3557f4e7fc30..ea65de0160c3 100644 > >>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > >>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > >>> @@ -324,6 +324,7 @@ enum smu_table_id > >>> SMU_TABLE_OVERDRIVE, > >>> SMU_TABLE_I2C_COMMANDS, > >>> SMU_TABLE_PACE, > >>> + SMU_TABLE_ECCINFO, > >>> SMU_TABLE_COUNT, > >>> 
}; > >>> > >>> @@ -340,6 +341,7 @@ struct smu_table_context > >>> void *max_sustainable_clocks; > >>> struct smu_bios_boot_up_values boot_values; > >>> void *driver_pptable; > >>> + void *ecc_table; > >>> struct smu_table tables[SMU_TABLE_COUNT]; > >>> /* > >>> * The driver table is just a staging buffer for @@ -1261,6 > >>> +1263,11 @@ struct pptable_funcs { > >>> * > >> of SMUBUS table. > >>> */ > >>> int (*send_hbm_bad_pages_num)(struct smu_context *smu, > >> uint32_t > >>> size); > >>> + > >>> + /** > >>> + * @get_ecc_table: message SMU to get ECC INFO table. > >>> + */ > >>> + ssize_t (*get_ecc_info)(struct smu_context *smu, void *table); > >>> }; > >>> > >>> typedef enum { > >>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > >>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > >>> index f835d86cc2f5..5e4ba0e14a91 100644 > >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > >>> @@ -190,6 +190,7 @@ static const struct cmn2asic_mapping > >> aldebaran_table_map[SMU_TABLE_COUNT] = { > >>> TAB_MAP(SMU_METRICS), > >>> TAB_MAP(DRIVER_SMU_CONFIG), > >>> TAB_MAP(I2C_COMMANDS), > >>> + TAB_MAP(ECCINFO), > >>> }; > >>> > >>> static const uint8_t aldebaran_throttler_map[] = { @@ -223,6 > >>> +224,9 @@ static int aldebaran_tables_init(struct smu_context *smu) > >>> SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, > >> sizeof(SwI2cRequest_t), > >>> PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); > >>> > >>> + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, > >> sizeof(EccInfoTable_t), > >>> + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); > >>> + > >>> smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), > >> GFP_KERNEL); > >>> if (!smu_table->metrics_table) > >>> return -ENOMEM; > >>> @@ -235,6 +239,10 @@ static int aldebaran_tables_init(struct > >>> smu_context > >> *smu) > >>> return -ENOMEM; > >>> } > >>> > >>> + smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, > >> GFP_KERNEL); > >>> 
+ if (!smu_table->ecc_table) > >>> + return -ENOMEM; > >>> + > >>> return 0; > >>> } > >>> > >>> @@ -1765,6 +1773,35 @@ static ssize_t > >>> aldebaran_get_gpu_metrics(struct > >> smu_context *smu, > >>> return sizeof(struct gpu_metrics_v1_3); > >>> } > >>> > >>> +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu, > >>> + void *table) > >>> +{ > >>> + struct smu_table_context *smu_table = &smu->smu_table; > >>> + EccInfoTable_t ecc_table; > >>> + struct ecc_info_per_ch *ecc_info_per_channel = NULL; > >>> + int i, ret = 0; > >>> + struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table; > >>> + > >>> + ret = smu_cmn_get_ecc_info_table(smu, > >>> + &ecc_table); > >>> + if (ret) > >>> + return ret; > >>> + > >>> + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { > >>> + ecc_info_per_channel = &(eccinfo->ecc[i]); > >>> + ecc_info_per_channel->ce_count_lo_chip = > >>> + ecc_table.EccInfo[i].ce_count_lo_chip; > >>> + ecc_info_per_channel->ce_count_hi_chip = > >>> + ecc_table.EccInfo[i].ce_count_hi_chip; > >>> + ecc_info_per_channel->mca_umc_status = > >>> + ecc_table.EccInfo[i].mca_umc_status; > >>> + ecc_info_per_channel->mca_umc_addr = > >>> + ecc_table.EccInfo[i].mca_umc_addr; > >>> + } > >>> + > >>> + return ret; > >>> +} > >>> + > >>> static int aldebaran_mode1_reset(struct smu_context *smu) > >>> { > >>> u32 smu_version, fatal_err, param; @@ -1967,6 +2004,7 @@ static > >>> const struct pptable_funcs > >> aldebaran_ppt_funcs = { > >>> .i2c_init = aldebaran_i2c_control_init, > >>> .i2c_fini = aldebaran_i2c_control_fini, > >>> .send_hbm_bad_pages_num = > >> aldebaran_smu_send_hbm_bad_page_num, > >>> + .get_ecc_info = aldebaran_get_ecc_info, > >>> }; > >>> > >>> void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git > >>> a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > >>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > >>> index 4d96099a9bb1..55421ea622fb 100644 > >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > >>> +++ 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > >>> @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct > >> smu_context *smu) > >>> kfree(smu_table->hardcode_pptable); > >>> smu_table->hardcode_pptable = NULL; > >>> > >>> + kfree(smu_table->ecc_table); > >>> kfree(smu_table->metrics_table); > >>> kfree(smu_table->watermarks_table); > >>> + smu_table->ecc_table = NULL; > >>> smu_table->metrics_table = NULL; > >>> smu_table->watermarks_table = NULL; > >>> smu_table->metrics_time = 0; > >>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > >>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > >>> index 843d2cbfc71d..e229c9b09d80 100644 > >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > >>> @@ -983,6 +983,30 @@ int smu_cmn_get_metrics_table(struct > >> smu_context *smu, > >>> return ret; > >>> } > >>> > >>> +int smu_cmn_get_ecc_info_table(struct smu_context *smu, > >>> + void *ecc_table) > >>> +{ > >>> + struct smu_table_context *smu_table= &smu->smu_table; > >>> + uint32_t table_size = > >>> + smu_table->tables[SMU_TABLE_ECCINFO].size; > >>> + int ret = 0; > >>> + > >>> + ret = smu_cmn_update_table(smu, > >>> + SMU_TABLE_ECCINFO, > >>> + 0, > >>> + smu_table->ecc_table, > >>> + false); > >>> + if (ret) { > >>> + dev_info(smu->adev->dev, "Failed to export SMU ecc > >> table!\n"); > >>> + return ret; > >>> + } > >>> + > >>> + if (ecc_table) > >>> + memcpy(ecc_table, smu_table->ecc_table, table_size); > >> > >> This copy to another buffer is redundant. You may use ecc_table > >> directly in the callback, then this method itself looks unnecessary. > >> Instead of calling smu_cmn_get_ecc_info_table(), call > >> smu_cmn_update_table() and copy directly from ecc_table. 
> > [Yang, Stanley] This design is intended to protect ecc_table in the future if multiple > threads call smu_cmn_get_ecc_info_table at the same time; it should add a mutex > lock just like the metrics table handling if necessary, but for now the test case is simple so > I didn't do that. > This is not like a metrics table use case. RAS error harvesting is not a > multithread case. The error registers are cleared after reading, so I thought > it's always expected to be one user at a time. Besides, I don't know if there is > a case where driver needs to report errors from multiple threads. [Yang, Stanley] It's not RAS error harvesting; considering the debugfs node file umc_error_cnt and the sysfs node file umc_error_cnt, is there any mechanism to ensure users read them from only one thread at a time? > > Thanks, > Lijo > > > >> > >> Thanks, > >> Lijo > >> > >>> + > >>> + return 0; > >>> +} > >>> + > >>> void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, > >>> uint8_t > >> crev) > >>> { > >>> struct metrics_table_header *header = (struct > >>> metrics_table_header *)table; diff --git > >>> a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > >>> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > >>> index beea03810bca..0adc5451373b 100644 > >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h > >>> @@ -105,6 +105,9 @@ int smu_cmn_get_metrics_table(struct > >> smu_context *smu, > >>> void *metrics_table, > >>> bool bypass_cache); > >>> > >>> +int smu_cmn_get_ecc_info_table(struct smu_context *smu, > >>> + void *table); > >>> + > >>> void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, > >>> uint8_t crev); > >>> > >>> int smu_cmn_set_mp1_state(struct smu_context *smu, > >>>