[AMD Official Use Only] Series looks good to me. Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Regards, Hawking -----Original Message----- From: Lazar, Lijo <Lijo.Lazar@xxxxxxx> Sent: Thursday, November 18, 2021 22:41 To: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Clements, John <John.Clements@xxxxxxx>; Quan, Evan <Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> Subject: Re: 回复: [PATCH Review 3/4] drm/amdgpu: add message smu to get ecc_table v2 On 11/18/2021 6:05 PM, Yang, Stanley wrote: > [AMD Official Use Only] > > > >> -----邮件原件----- >> 发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx> >> 发送时间: Thursday, November 18, 2021 7:33 PM >> 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- >> gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; >> Clements, John <John.Clements@xxxxxxx>; Quan, Evan >> <Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> >> 主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get >> ecc_table v2 >> >> >> >> On 11/18/2021 3:03 PM, Stanley.Yang wrote: >>> support ECC TABLE message, this table include umc ras error count >>> and error address >>> >>> v2: >>> add smu version check to query whether support ecctable >>> call smu_cmn_update_table to get ecctable directly >>> >>> Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> >>> --- >>> drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 8 +++ >>> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 ++++ >>> .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 70 >> +++++++++++++++++++ >>> .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 + >>> 4 files changed, 94 insertions(+) >>> >>> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h >>> b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h >>> index 3557f4e7fc30..7a06021a58f0 100644 >>> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h >>> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h >>> @@ -324,6 +324,7 @@ enum smu_table_id >>> SMU_TABLE_OVERDRIVE, >>> SMU_TABLE_I2C_COMMANDS, >>> SMU_TABLE_PACE, >>> + SMU_TABLE_ECCINFO, >>> SMU_TABLE_COUNT, >>> }; >>> >>> @@ -340,6 +341,7 @@ struct smu_table_context >>> void *max_sustainable_clocks; >>> struct smu_bios_boot_up_values boot_values; >>> void *driver_pptable; >>> + void *ecc_table; >>> struct smu_table tables[SMU_TABLE_COUNT]; >>> /* >>> * The driver table is just a staging buffer for @@ -1261,6 >>> +1263,11 @@ struct pptable_funcs { >>> * >> of SMUBUS table. >>> */ >>> int (*send_hbm_bad_pages_num)(struct smu_context *smu, >> uint32_t >>> size); >>> + >>> + /** >>> + * @get_ecc_table: message SMU to get ECC INFO table. >>> + */ >>> + ssize_t (*get_ecc_info)(struct smu_context *smu, void *table); >>> }; >>> >>> typedef enum { >>> @@ -1397,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, >>> bool enable); >>> >>> int smu_wait_for_event(struct amdgpu_device *adev, enum >> smu_event_type event, >>> uint64_t event_arg); >>> +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc); >>> >>> #endif >>> #endif >>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c >>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c >>> index 01168b8955bf..fd3b6b460b12 100644 >>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c >>> @@ -3072,6 +3072,20 @@ int smu_set_light_sbr(struct smu_context >>> *smu, >> bool enable) >>> return ret; >>> } >>> >>> +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) { >>> + int ret = -EOPNOTSUPP; >>> + >>> + mutex_lock(&smu->mutex); >>> + if (smu->ppt_funcs && >>> + smu->ppt_funcs->get_ecc_info) >>> + ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc); >>> + mutex_unlock(&smu->mutex); >>> + >>> + return ret; >>> + >>> +} >>> + >>> static int smu_get_prv_buffer_details(void *handle, void **addr, >>> size_t >> *size) >>> { >>> struct smu_context *smu = handle; diff --git >>> a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c >>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c >>> index f835d86cc2f5..4c21609ccea5 100644 >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c >>> @@ -78,6 +78,12 @@ >>> >>> #define smnPCIE_ESM_CTRL 0x111003D0 >>> >>> +/* >>> + * SMU support ECCTABLE since version 68.42.0, >>> + * use this to check ECCTALE feature whether support */ #define >>> +SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00 >>> + >>> static const struct smu_temperature_range smu13_thermal_policy[] = >>> { >>> {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, >>> 99000}, @@ -190,6 +196,7 @@ static const struct cmn2asic_mapping >> aldebaran_table_map[SMU_TABLE_COUNT] = { >>> TAB_MAP(SMU_METRICS), >>> TAB_MAP(DRIVER_SMU_CONFIG), >>> TAB_MAP(I2C_COMMANDS), >>> + TAB_MAP(ECCINFO), >>> }; >>> >>> static const uint8_t aldebaran_throttler_map[] = { @@ -223,6 >>> +230,9 @@ static int aldebaran_tables_init(struct smu_context *smu) >>> SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, >> sizeof(SwI2cRequest_t), >>> PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); >>> >>> + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, >> sizeof(EccInfoTable_t), >>> + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); >>> + >>> smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), >> GFP_KERNEL); >>> if (!smu_table->metrics_table) >>> return -ENOMEM; >>> @@ -235,6 +245,10 @@ static int aldebaran_tables_init(struct >>> smu_context >> *smu) >>> return -ENOMEM; >>> } >>> >>> + smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, >> GFP_KERNEL); >>> + if (!smu_table->ecc_table) >>> + return -ENOMEM; >>> + >>> return 0; >>> } >>> >>> @@ -1765,6 +1779,61 @@ static ssize_t >>> aldebaran_get_gpu_metrics(struct >> smu_context *smu, >>> return sizeof(struct gpu_metrics_v1_3); >>> } >>> >>> +static int aldebaran_check_ecc_table_support(struct smu_context >>> +*smu) { >>> + uint32_t if_version = 0xff, smu_version = 0xff; >>> + int ret = 0; >>> + >>> + ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version); >>> + if (ret) >>> + ret = -EOPNOTSUPP; // return not support if failed get Nitpick - comment style >> smu_version >>> + >>> + if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION) >>> + ret = -EOPNOTSUPP; >>> + >>> + return ret; >>> +} >>> + >>> +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu, >>> + void *table) >>> +{ >>> + struct smu_table_context *smu_table = &smu->smu_table; >>> + EccInfoTable_t *ecc_table = NULL; >>> + struct ecc_info_per_ch *ecc_info_per_channel = NULL; >>> + int i, ret = 0; >>> + struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table; >>> + >> >> Missed to ask last time. Since umc_ecc_info is a common struct, do >> you also want to pass back the number of channels having data? >> >> Now this struct can hold max of 32 channel data. Let's say if the >> same interface is going to be used on another ASIC X having only 16 channels. >> Then the callback for ASIC X fills data only for 16 channels. Or, you >> expect that to be taken care at the caller side? > > [Yang, Stanley] : If ASIC X have only 16 channels, the callback only fill data for 16 channels, and caller side also need consider its own channel number to handle with umc_ecc_info. > Thanks for the details. With the nitpick above and Evan's comments on the patch subject addressed, patches 1 and 3 are Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx> 2 and 4 look good to me. Hawking or John should a take look though. Thanks, Lijo >> >> Thanks, >> Lijo >> >>> + ret = aldebaran_check_ecc_table_support(smu); >>> + if (ret) >>> + return ret; >>> + >>> + ret = smu_cmn_update_table(smu, >>> + SMU_TABLE_ECCINFO, >>> + 0, >>> + smu_table->ecc_table, >>> + false); >>> + if (ret) { >>> + dev_info(smu->adev->dev, "Failed to export SMU ecc >> table!\n"); >>> + return ret; >>> + } >>> + >>> + ecc_table = (EccInfoTable_t *)smu_table->ecc_table; >>> + >>> + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { >>> + ecc_info_per_channel = &(eccinfo->ecc[i]); >>> + ecc_info_per_channel->ce_count_lo_chip = >>> + ecc_table->EccInfo[i].ce_count_lo_chip; >>> + ecc_info_per_channel->ce_count_hi_chip = >>> + ecc_table->EccInfo[i].ce_count_hi_chip; >>> + ecc_info_per_channel->mca_umc_status = >>> + ecc_table->EccInfo[i].mca_umc_status; >>> + ecc_info_per_channel->mca_umc_addr = >>> + ecc_table->EccInfo[i].mca_umc_addr; >>> + } >>> + >>> + return ret; >>> +} >>> + >>> static int aldebaran_mode1_reset(struct smu_context *smu) >>> { >>> u32 smu_version, fatal_err, param; @@ -1967,6 +2036,7 @@ static >>> const struct pptable_funcs >> aldebaran_ppt_funcs = { >>> .i2c_init = aldebaran_i2c_control_init, >>> .i2c_fini = aldebaran_i2c_control_fini, >>> .send_hbm_bad_pages_num = >> aldebaran_smu_send_hbm_bad_page_num, >>> + .get_ecc_info = aldebaran_get_ecc_info, >>> }; >>> >>> void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git >>> a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c >>> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c >>> index 4d96099a9bb1..55421ea622fb 100644 >>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c >>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c >>> @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct >> smu_context *smu) >>> kfree(smu_table->hardcode_pptable); >>> smu_table->hardcode_pptable = NULL; >>> >>> + kfree(smu_table->ecc_table); >>> kfree(smu_table->metrics_table); >>> kfree(smu_table->watermarks_table); >>> + smu_table->ecc_table = NULL; >>> smu_table->metrics_table = NULL; >>> smu_table->watermarks_table = NULL; >>> smu_table->metrics_time = 0; >>>