Thanks tao, will update. > -----邮件原件----- > 发件人: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > 发送时间: Monday, May 23, 2022 6:22 PM > 收件人: Yang, Stanley <Stanley.Yang@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Quan, > Evan <Evan.Quan@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx> > 抄送: Yang, Stanley <Stanley.Yang@xxxxxxx> > 主题: RE: [PATCH Review 2/2] drm/amdgpu: print umc correctable error address > > [AMD Official Use Only - General] > > > > > -----Original Message----- > > From: Stanley.Yang <Stanley.Yang@xxxxxxx> > > Sent: Monday, May 23, 2022 4:17 PM > > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Quan, > Evan > > <Evan.Quan@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx> > > Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> > > Subject: [PATCH Review 2/2] drm/amdgpu: print umc correctable error > > address > > > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 ++ > > drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 55 > > ++++++++++++++++++- > > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 + > > 3 files changed, 60 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > index 3f23f9ad3249..985b8cddb5a1 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > @@ -1108,6 +1108,11 @@ struct amdgpu_device { > > > > bool scpm_enabled; > > uint32_t scpm_status; > > + > > + /* Determine smu ecctable whether support > > + * record correctable error address > > + */ > > + int record_ce_addr_supported; > > }; > > > > static inline struct amdgpu_device *drm_to_adev(struct drm_device > > *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > > b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > > index 606892dbea1c..47bd39e52e9b 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > > 
@@ -119,6 +119,27 @@ static void > > umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device > > *error_count += 1; > > > > umc_v6_7_query_error_status_helper(adev, > > mc_umc_status, umc_reg_offset); > > + > > + if (adev->record_ce_addr_supported) { > > + uint64_t err_addr, soc_pa; > > + uint32_t channel_index = > > + adev->umc.channel_idx_tbl[umc_inst * > > adev->umc.channel_inst_num + > > +ch_inst]; > > + > > + err_addr = ras- > > >umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr; > > + err_addr = REG_GET_FIELD(err_addr, > > MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); > > + /* translate umc channel address to soc pa, 3 parts > > are included */ > > + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | > > + > > ADDR_OF_256B_BLOCK(channel_index) | > > + OFFSET_IN_256B_BLOCK(err_addr); > > + > > + /* The umc channel bits are not original values, they > > are hashed */ > > + SET_CHANNEL_HASH(channel_index, soc_pa); > > + > > + /* clear [C4 C3 C2] in soc physical address */ > > + soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); > > [Tao] This clearing is only preparation for looping over all column bits in the same row; here you > only need the physical address of one page, so this code can be removed. 
> > > + > > + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", > > soc_pa); > > + } > > } > > } > > > > @@ -251,7 +272,9 @@ static void > > umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev > > > > static void umc_v6_7_query_correctable_error_count(struct > > amdgpu_device *adev, > > uint32_t umc_reg_offset, > > - unsigned long > > *error_count) > > + unsigned long > > *error_count, > > + uint32_t ch_inst, > > + uint32_t umc_inst) > > { > > uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; > > uint32_t ecc_err_cnt, ecc_err_cnt_addr; @@ -295,6 +318,33 @@ static > > void umc_v6_7_query_correctable_error_count(struct > > amdgpu_device *adev, > > *error_count += 1; > > > > umc_v6_7_query_error_status_helper(adev, > > mc_umc_status, umc_reg_offset); > > + > > + { > > + uint64_t err_addr, soc_pa; > > + uint32_t mc_umc_addrt0; > > + uint32_t channel_index; > > + > > + mc_umc_addrt0 = > > + SOC15_REG_OFFSET(UMC, 0, > > regMCA_UMC_UMC0_MCUMC_ADDRT0); > > + > > + channel_index = > > + adev->umc.channel_idx_tbl[umc_inst * > > adev->umc.channel_inst_num + > > +ch_inst]; > > + > > + err_addr = RREG64_PCIE((mc_umc_addrt0 + > > umc_reg_offset) * 4); > > + err_addr = REG_GET_FIELD(err_addr, > > MCA_UMC_UMC0_MCUMC_ADDRT0, > > +ErrorAddr); > > + > > + /* translate umc channel address to soc pa, 3 parts > > are included */ > > + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | > > + > > ADDR_OF_256B_BLOCK(channel_index) | > > + OFFSET_IN_256B_BLOCK(err_addr); > > + > > + /* The umc channel bits are not original values, they > > are hashed */ > > + SET_CHANNEL_HASH(channel_index, soc_pa); > > + > > + /* clear [C4 C3 C2] in soc physical address */ > > + soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); > > + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", > > soc_pa); > > + } > > } > > } > > > > @@ -395,7 +445,8 @@ static void umc_v6_7_query_ras_error_count(struct > > amdgpu_device *adev, > > ch_inst); > > umc_v6_7_query_correctable_error_count(adev, > > umc_reg_offset, > > - 
&(err_data->ce_count)); > > + &(err_data->ce_count), > > + ch_inst, umc_inst); > > umc_v6_7_querry_uncorrectable_error_count(adev, > > umc_reg_offset, > > &(err_data- > > >ue_count)); > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > index e58df9490cec..e182088b4ac8 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > @@ -1908,6 +1908,8 @@ static ssize_t aldebaran_get_ecc_info(struct > > smu_context *smu, > > ecc_table_v2->EccInfo[i].mca_umc_addr; > > ecc_info_per_channel->mca_ceumc_addr = > > ecc_table_v2->EccInfo[i].mca_ceumc_addr; > > + if (!smu->adev->record_ce_addr_supported) > > [Tao] it seems the check is unnecessary. > > > + smu->adev->record_ce_addr_supported =1; > > } > > } > > > > -- > > 2.17.1