[AMD Official Use Only] > -----邮件原件----- > 发件人: Lazar, Lijo <Lijo.Lazar@xxxxxxx> > 发送时间: Wednesday, November 17, 2021 7:15 PM > 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; > Clements, John <John.Clements@xxxxxxx>; Quan, Evan > <Evan.Quan@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> > 主题: Re: [PATCH Review 4/4] query umc error info from ecc_table > > > > On 11/17/2021 3:41 PM, Stanley.Yang wrote: > > if smu support ECCTABLE, driver can message smu to get ecc_table then > > query umc error info from ECCTABLE apply pmfw version check to ensure > > backward compatibility > > > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 42 ++++++++--- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 7 ++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 71 +++++++++++++-- > ---- > > drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h | 1 + > > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 12 ++++ > > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 4 ++ > > 6 files changed, 107 insertions(+), 30 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > index 90f0db3b4f65..6b0f2ba1e420 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > @@ -888,6 +888,38 @@ void amdgpu_ras_mca_query_error_status(struct > amdgpu_device *adev, > > } > > } > > > > +static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, > > +struct ras_err_data *err_data) { > > + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > + > > + /* > > + * choosing right query method according to > > + * whether smu support query error information > > + */ > > + if ((ras->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) && > > + !smu_get_ecc_info(&adev->smu, (void *)&(ras- > >umc_ecc))) { > > + > > This version check should be in aldebaran_ppt implementation. In general > the callback will check the FW version that supports ECC table for the > corresponding ASIC. It may return ENOTSUPP or similar if the FW version > doesn't support ECC table and that may be checked here. Keeping > smu_version in ras context is not needed. [Yang, Stanley] I think just check Aldebaran_ppt callback function is not enough here, considering this scenario using amdgpu driver with get_ecc_info callback function but the pmfw is an older one without ecctable feature. PMFW support ecctable since 68.42.0 for Aldebaran. > > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs- > >message_smu_query_ras_error_count) > > + adev->umc.ras_funcs- > >message_smu_query_ras_error_count(adev, > > +err_data); > > + > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs- > >message_smu_query_ras_error_address) > > + adev->umc.ras_funcs- > >message_smu_query_ras_error_address(adev, err_data); > > + } else { > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs->query_ras_error_count) > > + adev->umc.ras_funcs->query_ras_error_count(adev, > err_data); > > + > > + /* umc query_ras_error_address is also responsible for > clearing > > + * error status > > + */ > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs->query_ras_error_address) > > + adev->umc.ras_funcs- > >query_ras_error_address(adev, err_data); > > + } > > +} > > + > > /* query/inject/cure begin */ > > int amdgpu_ras_query_error_status(struct amdgpu_device *adev, > > struct ras_query_if *info) > > @@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct > > amdgpu_device *adev, > > > > switch (info->head.block) { > > case AMDGPU_RAS_BLOCK__UMC: > > - if (adev->umc.ras_funcs && > > - adev->umc.ras_funcs->query_ras_error_count) > > - adev->umc.ras_funcs->query_ras_error_count(adev, > &err_data); > > - /* umc query_ras_error_address is also responsible for > clearing > > - * error status > > - */ > > - if (adev->umc.ras_funcs && > > - adev->umc.ras_funcs->query_ras_error_address) > > - adev->umc.ras_funcs- > >query_ras_error_address(adev, &err_data); > > + amdgpu_ras_get_ecc_info(adev, &err_data); > > break; > > case AMDGPU_RAS_BLOCK__SDMA: > > if (adev->sdma.funcs->query_ras_error_count) { diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > index bcbf3264d92f..3f0de0cc8403 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > @@ -322,6 +322,12 @@ struct ras_common_if { > > > > #define MAX_UMC_CHANNEL_NUM 32 > > > > +/* > > + * SMU support ECCTABLE since version 68.42.0, > > + * use this to decide query umc error info method */ #define > > +SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00 > > + > > struct ecc_info_per_ch { > > uint16_t ce_count_lo_chip; > > uint16_t ce_count_hi_chip; > > @@ -375,6 +381,7 @@ struct amdgpu_ras { > > > > /* record umc error info queried from smu */ > > struct umc_ecc_info umc_ecc; > > + uint32_t smu_version; > > }; > > > > struct ras_fs_data { > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > index 0c7c56a91b25..2c3e97c9410b 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > @@ -97,28 +97,57 @@ int amdgpu_umc_process_ras_data_cb(struct > amdgpu_device *adev, > > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > > > kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > > - if (adev->umc.ras_funcs && > > - adev->umc.ras_funcs->query_ras_error_count) > > - adev->umc.ras_funcs->query_ras_error_count(adev, > ras_error_status); > > > > - if (adev->umc.ras_funcs && > > - adev->umc.ras_funcs->query_ras_error_address && > > - adev->umc.max_ras_err_cnt_per_query) { > > - err_data->err_addr = > > - kcalloc(adev->umc.max_ras_err_cnt_per_query, > > - sizeof(struct eeprom_table_record), > GFP_KERNEL); > > - > > - /* still call query_ras_error_address to clear error status > > - * even NOMEM error is encountered > > - */ > > - if(!err_data->err_addr) > > - dev_warn(adev->dev, "Failed to alloc memory for " > > - "umc error address record!\n"); > > - > > - /* umc query_ras_error_address is also responsible for > clearing > > - * error status > > - */ > > - adev->umc.ras_funcs->query_ras_error_address(adev, > ras_error_status); > > + if ((con->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) && > > + !smu_get_ecc_info(&adev->smu, (void *)&(con- > >umc_ecc))) { > > + > Same comment as above. > > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs- > >message_smu_query_ras_error_count) > > + adev->umc.ras_funcs- > >message_smu_query_ras_error_count(adev, > > +ras_error_status); > > + > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs- > >message_smu_query_ras_error_address && > > + adev->umc.max_ras_err_cnt_per_query) { > > + err_data->err_addr = > > + kcalloc(adev- > >umc.max_ras_err_cnt_per_query, > > + sizeof(struct eeprom_table_record), > GFP_KERNEL); > > + > > + /* still call query_ras_error_address to clear error > status > > + * even NOMEM error is encountered > > + */ > > + if(!err_data->err_addr) > > + dev_warn(adev->dev, "Failed to alloc > memory for " > > + "umc error address > record!\n"); > > + > > + /* umc query_ras_error_address is also responsible > for clearing > > + * error status > > + */ > > + adev->umc.ras_funcs- > >message_smu_query_ras_error_address(adev, ras_error_status); > > + } > > + } else { > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs->query_ras_error_count) > > + adev->umc.ras_funcs->query_ras_error_count(adev, > > +ras_error_status); > > + > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs->query_ras_error_address && > > + adev->umc.max_ras_err_cnt_per_query) { > > + err_data->err_addr = > > + kcalloc(adev- > >umc.max_ras_err_cnt_per_query, > > + sizeof(struct eeprom_table_record), > GFP_KERNEL); > > + > > + /* still call query_ras_error_address to clear error > status > > + * even NOMEM error is encountered > > + */ > > + if(!err_data->err_addr) > > + dev_warn(adev->dev, "Failed to alloc > memory for " > > + "umc error address > record!\n"); > > + > > + /* umc query_ras_error_address is also responsible > for clearing > > + * error status > > + */ > > + adev->umc.ras_funcs- > >query_ras_error_address(adev, ras_error_status); > > + } > > } > > > > /* only uncorrectable error needs gpu reset */ diff --git > > a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > index ea65de0160c3..7a06021a58f0 100644 > > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h > > @@ -1404,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, > > bool enable); > > > > int smu_wait_for_event(struct amdgpu_device *adev, enum > smu_event_type event, > > uint64_t event_arg); > > +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc); > > > > #endif > > #endif > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > index 01168b8955bf..6340c079f35e 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > > @@ -3072,6 +3072,18 @@ int smu_set_light_sbr(struct smu_context *smu, > bool enable) > > return ret; > > } > > > > +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) { > > + int ret = -1; > > + > > + if (smu->ppt_funcs && > > + smu->ppt_funcs->get_ecc_info) > > + ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc); > > + > > Shouldn't return -1 if ppt func is not present. If ppt func is not present, that > means this method is not supported for the SOC; return ENOTSUPP. [Yang, Stanley] thanks, return -ENOTSUPP is more reasonable. > > > + return ret; > > + > > +} > > + > > Probably the above function should be clubbed with patch 3 - smu support > for getting ras ecc info. > > > static int smu_get_prv_buffer_details(void *handle, void **addr, size_t > *size) > > { > > struct smu_context *smu = handle; > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > index 55421ea622fb..55ef10ca684a 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > @@ -200,11 +200,15 @@ int smu_v13_0_check_fw_version(struct > smu_context *smu) > > uint16_t smu_major; > > uint8_t smu_minor, smu_debug; > > int ret = 0; > > + struct amdgpu_ras *ras = amdgpu_ras_get_context(smu->adev); > > > > ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version); > > if (ret) > > return ret; > > > > + /* record smu interface version, help umc query error method */ > > + ras->smu_version = smu_version; > > + > > This is not needed. ASIC specific functions can check the FW version for ECC > table support. > > Thanks, > Lijo > > > smu_major = (smu_version >> 16) & 0xffff; > > smu_minor = (smu_version >> 8) & 0xff; > > smu_debug = (smu_version >> 0) & 0xff; > >