[AMD Official Use Only - General] Hi Lijo, +@Joo, Maria > -----邮件原件----- > 发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > 发送时间: Monday, May 23, 2022 5:12 PM > 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, > Tao <Tao.Zhou1@xxxxxxx>; Quan, Evan <Evan.Quan@xxxxxxx> > 主题: Re: [PATCH Review 1/2] drm/amdgpu/pm: support mca_ceumc_addr in > ecctable > > > > On 5/23/2022 1:47 PM, Stanley.Yang wrote: > > SMU add a new variable mca_ceumc_addr to record umc correctable error > > address in EccInfo table, driver side add ecctable_v2 to support this > > feature > > > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + > > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 2 + > > .../inc/pmfw_if/smu13_driver_if_aldebaran.h | 15 +++ > > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 101 ++++++++++++++---- > > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 2 + > > 5 files changed, 98 insertions(+), 23 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > index b9a6fac2b8b2..28e603243b67 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > @@ -328,6 +328,7 @@ struct ecc_info_per_ch { > > uint16_t ce_count_hi_chip; > > uint64_t mca_umc_status; > > uint64_t mca_umc_addr; > > + uint64_t mca_ceumc_addr; > > }; > > > > struct umc_ecc_info { > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > index a6a7b6c33683..9f7257ada437 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > @@ -322,6 +322,7 @@ enum smu_table_id > > SMU_TABLE_PACE, > > SMU_TABLE_ECCINFO, > > SMU_TABLE_COMBO_PPTABLE, > > + SMU_TABLE_ECCINFO_V2, > > Hi Stanley, > > This is not the right approach. Need to ask FW team to fix this. There shouldn't > be any new id with each version of table. You may check Sienna Cichlid smu > metrics table as an example and ask FW team to follow something similar. I > don't see 68.55 being released, so it's not late anyway. We don't need to keep > defining pointers in table context per version of ECC table. > > Thanks, > Lijo [Yang, Stanley] : There is not enough padding space in ecc_table, you can check below struct, the new added variable is uint32_t type, I think smu can't add uint32_t type in ecc_table directly without change ecc_tabe size. If you have any better approach, we can discuss a better method to complete it. 512 typedef struct { 513 uint64_t mca_umc_status; 514 uint64_t mca_umc_addr; 515 uint16_t ce_count_lo_chip; 516 uint16_t ce_count_hi_chip; 517 518 uint32_t eccPadding; 519 } EccInfo_t; Thanks, Stanley > > > SMU_TABLE_COUNT, > > }; > > > > @@ -340,6 +341,7 @@ struct smu_table_context > > void *driver_pptable; > > void *combo_pptable; > > void *ecc_table; > > + void *ecc_table_v2; // adapt to smu support record > mca_ceumc_addr > > void *driver_smu_config_table; > > struct smu_table tables[SMU_TABLE_COUNT]; > > /* > > diff --git > > > a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h > > > b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h > > index 0f67c56c2863..2868604eff49 100644 > > --- > > > a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebaran.h > > +++ > b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_aldebar > > +++ an.h > > @@ -522,6 +522,21 @@ typedef struct { > > EccInfo_t EccInfo[ALDEBARAN_UMC_CHANNEL_NUM]; > > } EccInfoTable_t; > > > > +typedef struct { > > + uint64_t mca_umc_status; > > + uint64_t mca_umc_addr; > > + uint64_t mca_ceumc_addr; > > + > > + uint16_t ce_count_lo_chip; > > + uint16_t ce_count_hi_chip; > > + > > + uint32_t eccPadding; > > +} EccInfo_t_v2; > > + > > +typedef struct { > > + EccInfo_t_v2 EccInfo[ALDEBARAN_UMC_CHANNEL_NUM]; > > +} EccInfoTable_t_v2; > > + > > // These defines are used with the following messages: > > // SMC_MSG_TransferTableDram2Smu > > // SMC_MSG_TransferTableSmu2Dram > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > index 38af648cb857..e58df9490cec 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > @@ -82,6 +82,12 @@ > > */ > > #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00 > > > > +/* > > + * SMU support mca_ceumc_addr in ECCTABLE since version 68.55.0, > > + * use this to check mca_ceumc_addr record whether support */ > > +#define SUPPORT_ECCTABLE_V2_SMU_VERSION 0x00443700 > > + > > /* > > * SMU support BAD CHENNEL info MSG since version 68.51.00, > > * use this to check ECCTALE feature whether support @@ -239,6 > > +245,9 @@ static int aldebaran_tables_init(struct smu_context *smu) > > SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t), > > PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); > > > > + SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO_V2, > sizeof(EccInfoTable_t_v2), > > + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); > > + > > smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL); > > if (!smu_table->metrics_table) > > return -ENOMEM; > > @@ -255,6 +264,10 @@ static int aldebaran_tables_init(struct smu_context > *smu) > > if (!smu_table->ecc_table) > > return -ENOMEM; > > > > + smu_table->ecc_table_v2 = > kzalloc(tables[SMU_TABLE_ECCINFO_V2].size, GFP_KERNEL); > > + if (!smu_table->ecc_table_v2) > > + return -ENOMEM;; > > + > > return 0; > > } > > > > @@ -1802,7 +1815,8 @@ static ssize_t aldebaran_get_gpu_metrics(struct > smu_context *smu, > > return sizeof(struct gpu_metrics_v1_3); > > } > > > > -static int aldebaran_check_ecc_table_support(struct smu_context *smu) > > +static int aldebaran_check_ecc_table_support(struct smu_context *smu, > > + int *ecctable_version) > > { > > uint32_t if_version = 0xff, smu_version = 0xff; > > int ret = 0; > > @@ -1815,6 +1829,11 @@ static int > > aldebaran_check_ecc_table_support(struct smu_context *smu) > > > > if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION) > > ret = -EOPNOTSUPP; > > + else if (smu_version >= SUPPORT_ECCTABLE_SMU_VERSION && > > + smu_version < > SUPPORT_ECCTABLE_V2_SMU_VERSION) > > + *ecctable_version = 1; > > + else > > + *ecctable_version = 2; > > > > return ret; > > } > > @@ -1824,36 +1843,72 @@ static ssize_t aldebaran_get_ecc_info(struct > smu_context *smu, > > { > > struct smu_table_context *smu_table = &smu->smu_table; > > EccInfoTable_t *ecc_table = NULL; > > + EccInfoTable_t_v2 *ecc_table_v2 = NULL; > > struct ecc_info_per_ch *ecc_info_per_channel = NULL; > > int i, ret = 0; > > + int table_version = 0; > > struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table; > > > > - ret = aldebaran_check_ecc_table_support(smu); > > + ret = aldebaran_check_ecc_table_support(smu, &table_version); > > if (ret) > > return ret; > > > > - ret = smu_cmn_update_table(smu, > > - SMU_TABLE_ECCINFO, > > - 0, > > - smu_table->ecc_table, > > - false); > > - if (ret) { > > - dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n"); > > - return ret; > > - } > > + if (table_version == 1) { > > + ret = smu_cmn_update_table(smu, > > + SMU_TABLE_ECCINFO, > > + 0, > > + smu_table->ecc_table, > > + false); > > + if (ret) { > > + dev_info(smu->adev->dev, "Failed to export SMU ecc > table!\n"); > > + return ret; > > + } > > + > > + ecc_table = (EccInfoTable_t *)smu_table->ecc_table; > > + > > + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { > > + ecc_info_per_channel = &(eccinfo->ecc[i]); > > + ecc_info_per_channel->ce_count_lo_chip = > > + ecc_table->EccInfo[i].ce_count_lo_chip; > > + ecc_info_per_channel->ce_count_hi_chip = > > + ecc_table->EccInfo[i].ce_count_hi_chip; > > + ecc_info_per_channel->mca_umc_status = > > + ecc_table->EccInfo[i].mca_umc_status; > > + ecc_info_per_channel->mca_umc_addr = > > + ecc_table->EccInfo[i].mca_umc_addr; > > + } > > + } else if (table_version == 2) { > > + /* still use SMU_TABLE_ECC_INFO index, > > + * smu 68.55.0 add mca_ceumc_addr variable > > + * in EccInfo_t struct to report correctable > > + * error address and the table_id is not changed > > + */ > > + ret = smu_cmn_update_table(smu, > > + SMU_TABLE_ECCINFO, > > + 0, > > + smu_table->ecc_table_v2, > > + false); > > > > - ecc_table = (EccInfoTable_t *)smu_table->ecc_table; > > - > > - for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { > > - ecc_info_per_channel = &(eccinfo->ecc[i]); > > - ecc_info_per_channel->ce_count_lo_chip = > > - ecc_table->EccInfo[i].ce_count_lo_chip; > > - ecc_info_per_channel->ce_count_hi_chip = > > - ecc_table->EccInfo[i].ce_count_hi_chip; > > - ecc_info_per_channel->mca_umc_status = > > - ecc_table->EccInfo[i].mca_umc_status; > > - ecc_info_per_channel->mca_umc_addr = > > - ecc_table->EccInfo[i].mca_umc_addr; > > + if (ret) { > > + dev_info(smu->adev->dev, "Failed to export SMU ecc > table_v2!\n"); > > + return ret; > > + } > > + > > + ecc_table_v2 = (EccInfoTable_t_v2 *)smu_table->ecc_table_v2; > > + > > + for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) { > > + ecc_info_per_channel = &(eccinfo->ecc[i]); > > + ecc_info_per_channel->ce_count_lo_chip = > > + ecc_table_v2->EccInfo[i].ce_count_lo_chip; > > + ecc_info_per_channel->ce_count_hi_chip = > > + ecc_table_v2->EccInfo[i].ce_count_hi_chip; > > + ecc_info_per_channel->mca_umc_status = > > + ecc_table_v2->EccInfo[i].mca_umc_status; > > + ecc_info_per_channel->mca_umc_addr = > > + ecc_table_v2->EccInfo[i].mca_umc_addr; > > + ecc_info_per_channel->mca_ceumc_addr = > > + ecc_table_v2->EccInfo[i].mca_ceumc_addr; > > + } > > } > > > > return ret; > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > index ae6321af9d88..af2d84a16f3e 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > @@ -552,9 +552,11 @@ int smu_v13_0_fini_smc_tables(struct smu_context > *smu) > > kfree(smu_table->hardcode_pptable); > > smu_table->hardcode_pptable = NULL; > > > > + kfree(smu_table->ecc_table_v2); > > kfree(smu_table->ecc_table); > > kfree(smu_table->metrics_table); > > kfree(smu_table->watermarks_table); > > + smu_table->ecc_table_v2 = NULL; > > smu_table->ecc_table = NULL; > > smu_table->metrics_table = NULL; > > smu_table->watermarks_table = NULL; > >