[AMD Official Use Only] > -----邮件原件----- > 发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > 发送时间: Thursday, November 18, 2021 7:33 PM > 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; > Clements, John <John.Clements@xxxxxxx>; Quan, Evan > <Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> > 主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get > ecc_table v2 > > > > On 11/18/2021 3:03 PM, Stanley.Yang wrote: > > support ECC TABLE message, this table include umc ras error count and > > error address > > > > v2: > > add smu version check to query whether support ecctable > > call smu_cmn_update_table to get ecctable directly > > > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > > --- > > drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 8 +++ > > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 ++++ > > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 70 > +++++++++++++++++++ > > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 + > > 4 files changed, 94 insertions(+) > > > > diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > index 3557f4e7fc30..7a06021a58f0 100644 > > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > @@ -324,6 +324,7 @@ enum smu_table_id > > SMU_TABLE_OVERDRIVE, > > SMU_TABLE_I2C_COMMANDS, > > SMU_TABLE_PACE, > > + SMU_TABLE_ECCINFO, > > SMU_TABLE_COUNT, > > }; > > > > @@ -340,6 +341,7 @@ struct smu_table_context > > void *max_sustainable_clocks; > > struct smu_bios_boot_up_values boot_values; > > void *driver_pptable; > > + void *ecc_table; > > struct smu_table tables[SMU_TABLE_COUNT]; > > /* > > * The driver table is just a staging buffer for @@ -1261,6 > > +1263,11 @@ struct pptable_funcs { > > * > of SMUBUS table. > > */ > > int (*send_hbm_bad_pages_num)(struct smu_context *smu, > uint32_t > > size); > > + > > + /** > > + * @get_ecc_table: message SMU to get ECC INFO table. > > + */ > > + ssize_t (*get_ecc_info)(struct smu_context *smu, void *table); > > }; > > > > typedef enum { > > @@ -1397,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, > > bool enable); > > > > int smu_wait_for_event(struct amdgpu_device *adev, enum > smu_event_type event, > > uint64_t event_arg); > > +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc); > > > > #endif > > #endif > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > index 01168b8955bf..fd3b6b460b12 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > @@ -3072,6 +3072,20 @@ int smu_set_light_sbr(struct smu_context *smu, > bool enable) > > return ret; > > } > > > > +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) { > > + int ret = -EOPNOTSUPP; > > + > > + mutex_lock(&smu->mutex); > > + if (smu->ppt_funcs && > > + smu->ppt_funcs->get_ecc_info) > > + ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc); > > + mutex_unlock(&smu->mutex); > > + > > + return ret; > > + > > +} > > + > > static int smu_get_prv_buffer_details(void *handle, void **addr, size_t > *size) > > { > > struct smu_context *smu = handle; > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > index f835d86cc2f5..4c21609ccea5 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > @@ -78,6 +78,12 @@ > > > > #define smnPCIE_ESM_CTRL 0x111003D0 > > > > +/* > > + * SMU support ECCTABLE since version 68.42.0, > > + * use this to check ECCTALE feature whether support */ #define > > +SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00 > > + > > static const struct smu_temperature_range smu13_thermal_policy[] = > > { > > {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, > > 99000}, @@ -190,6 +196,7 @@ static const struct cmn2asic_mapping > aldebaran_table_map[SMU_TABLE_COUNT] = { > > TAB_MAP(SMU_METRICS), > > TAB_MAP(DRIVER_SMU_CONFIG), > > TAB_MAP(I2C_COMMANDS), > > + TAB_MAP(ECCINFO), > > }; > > > > static const uint8_t aldebaran_throttler_map[] = { @@ -223,6 +230,9 > > @@ static int aldebaran_tables_init(struct smu_context *smu) > > SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, > sizeof(SwI2cRequest_t), > > PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); > > > > + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, > sizeof(EccInfoTable_t), > > + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); > > + > > smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), > GFP_KERNEL); > > if (!smu_table->metrics_table) > > return -ENOMEM; > > @@ -235,6 +245,10 @@ static int aldebaran_tables_init(struct smu_context > *smu) > > return -ENOMEM; > > } > > > > + smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, > GFP_KERNEL); > > + if (!smu_table->ecc_table) > > + return -ENOMEM; > > + > > return 0; > > } > > > > @@ -1765,6 +1779,61 @@ static ssize_t aldebaran_get_gpu_metrics(struct > smu_context *smu, > > return sizeof(struct gpu_metrics_v1_3); > > } > > > > +static int aldebaran_check_ecc_table_support(struct smu_context *smu) > > +{ > > + uint32_t if_version = 0xff, smu_version = 0xff; > > + int ret = 0; > > + > > + ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version); > > + if (ret) > > + ret = -EOPNOTSUPP; // return not support if failed get > smu_version > > + > > + if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION) > > + ret = -EOPNOTSUPP; > > + > > + return ret; > > +} > > + > > +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu, > > + void *table) > > +{ > > + struct smu_table_context *smu_table = &smu->smu_table; > > + EccInfoTable_t *ecc_table = NULL; > > + struct ecc_info_per_ch *ecc_info_per_channel = NULL; > > + int i, ret = 0; > > + struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table; > > + > > Missed to ask last time. Since umc_ecc_info is a common struct, do you also > want to pass back the number of channels having data? > > Now this struct can hold max of 32 channel data. Let's say if the same > interface is going to be used on another ASIC X having only 16 channels. > Then the callback for ASIC X fills data only for 16 channels. Or, you expect that > to be taken care at the caller side? [Yang, Stanley] : If ASIC X have only 16 channels, the callback only fill data for 16 channels, and caller side also need consider its own channel number to handle with umc_ecc_info. > > Thanks, > Lijo > > > + ret = aldebaran_check_ecc_table_support(smu); > > + if (ret) > > + return ret; > > + > > + ret = smu_cmn_update_table(smu, > > + SMU_TABLE_ECCINFO, > > + 0, > > + smu_table->ecc_table, > > + false); > > + if (ret) { > > + dev_info(smu->adev->dev, "Failed to export SMU ecc > table!\n"); > > + return ret; > > + } > > + > > + ecc_table = (EccInfoTable_t *)smu_table->ecc_table; > > + > > + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { > > + ecc_info_per_channel = &(eccinfo->ecc[i]); > > + ecc_info_per_channel->ce_count_lo_chip = > > + ecc_table->EccInfo[i].ce_count_lo_chip; > > + ecc_info_per_channel->ce_count_hi_chip = > > + ecc_table->EccInfo[i].ce_count_hi_chip; > > + ecc_info_per_channel->mca_umc_status = > > + ecc_table->EccInfo[i].mca_umc_status; > > + ecc_info_per_channel->mca_umc_addr = > > + ecc_table->EccInfo[i].mca_umc_addr; > > + } > > + > > + return ret; > > +} > > + > > static int aldebaran_mode1_reset(struct smu_context *smu) > > { > > u32 smu_version, fatal_err, param; > > @@ -1967,6 +2036,7 @@ static const struct pptable_funcs > aldebaran_ppt_funcs = { > > .i2c_init = aldebaran_i2c_control_init, > > .i2c_fini = aldebaran_i2c_control_fini, > > .send_hbm_bad_pages_num = > aldebaran_smu_send_hbm_bad_page_num, > > + .get_ecc_info = aldebaran_get_ecc_info, > > }; > > > > void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git > > a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > index 4d96099a9bb1..55421ea622fb 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct > smu_context *smu) > > kfree(smu_table->hardcode_pptable); > > smu_table->hardcode_pptable = NULL; > > > > + kfree(smu_table->ecc_table); > > kfree(smu_table->metrics_table); > > kfree(smu_table->watermarks_table); > > + smu_table->ecc_table = NULL; > > smu_table->metrics_table = NULL; > > smu_table->watermarks_table = NULL; > > smu_table->metrics_time = 0; > >