> -----Original Message----- > From: Stanley.Yang <Stanley.Yang@xxxxxxx> > Sent: Tuesday, May 24, 2022 10:31 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Quan, Evan > <Evan.Quan@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx> > Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> > Subject: [PATCH Review v2 2/2] drm/amdgpu: print umc correctable error > address > > Changed from V1: > remove unnecessary same row physical address calculation > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 ++ > drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 52 ++++++++++++++++++- > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 1 + > 3 files changed, 56 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 3f23f9ad3249..985b8cddb5a1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1108,6 +1108,11 @@ struct amdgpu_device { > > bool scpm_enabled; > uint32_t scpm_status; > + > + /* Determine smu ecctable whether support > + * record correctable error address > + */ > + int record_ce_addr_supported; > }; > > static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff > --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > index 606892dbea1c..91bdc5e048c2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > @@ -119,6 +119,24 @@ static void > umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device > *error_count += 1; > > umc_v6_7_query_error_status_helper(adev, mc_umc_status, > umc_reg_offset); > + > + if (adev->record_ce_addr_supported) { > + uint64_t err_addr, soc_pa; > + uint32_t channel_index = > + adev->umc.channel_idx_tbl[umc_inst * adev- > >umc.channel_inst_num + > +ch_inst]; > + > + err_addr = ras- > 
>umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr; > + err_addr = REG_GET_FIELD(err_addr, > MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); > + /* translate umc channel address to soc pa, 3 parts are > included */ > + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | > + ADDR_OF_256B_BLOCK(channel_index) > | > + OFFSET_IN_256B_BLOCK(err_addr); > + > + /* The umc channel bits are not original values, they are > hashed */ > + SET_CHANNEL_HASH(channel_index, soc_pa); > + > + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", > soc_pa); > + } > } > } > > @@ -251,7 +269,9 @@ static void > umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev > > static void umc_v6_7_query_correctable_error_count(struct amdgpu_device > *adev, > uint32_t umc_reg_offset, > - unsigned long *error_count) > + unsigned long *error_count, > + uint32_t ch_inst, > + uint32_t umc_inst) > { > uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; > uint32_t ecc_err_cnt, ecc_err_cnt_addr; @@ -295,6 +315,33 @@ static > void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev, > *error_count += 1; > > umc_v6_7_query_error_status_helper(adev, mc_umc_status, > umc_reg_offset); > + > + { > + uint64_t err_addr, soc_pa; > + uint32_t mc_umc_addrt0; > + uint32_t channel_index; > + > + mc_umc_addrt0 = > + SOC15_REG_OFFSET(UMC, 0, > regMCA_UMC_UMC0_MCUMC_ADDRT0); > + > + channel_index = > + adev->umc.channel_idx_tbl[umc_inst * adev- > >umc.channel_inst_num + > +ch_inst]; > + > + err_addr = RREG64_PCIE((mc_umc_addrt0 + > umc_reg_offset) * 4); > + err_addr = REG_GET_FIELD(err_addr, > MCA_UMC_UMC0_MCUMC_ADDRT0, > +ErrorAddr); > + > + /* translate umc channel address to soc pa, 3 parts are > included */ > + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | > + ADDR_OF_256B_BLOCK(channel_index) > | > + OFFSET_IN_256B_BLOCK(err_addr); > + > + /* The umc channel bits are not original values, they are > hashed */ > + SET_CHANNEL_HASH(channel_index, soc_pa); > + > + /* clear [C4 C3 C2] in soc physical address */ > + soc_pa &= 
~(0x7ULL << UMC_V6_7_PA_C2_BIT); [Tao] This clearing of the [C4 C3 C2] bits needs to be dropped as well. > + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", > soc_pa); > + } > } > } > > @@ -395,7 +442,8 @@ static void umc_v6_7_query_ras_error_count(struct > amdgpu_device *adev, > ch_inst); > umc_v6_7_query_correctable_error_count(adev, > umc_reg_offset, > - &(err_data->ce_count)); > + &(err_data->ce_count), > + ch_inst, umc_inst); > umc_v6_7_querry_uncorrectable_error_count(adev, > umc_reg_offset, > &(err_data- > >ue_count)); > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > index 9cdfeea58085..e41a5b6fc64b 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > @@ -1882,6 +1882,7 @@ static ssize_t aldebaran_get_ecc_info(struct > smu_context *smu, > ecc_table->EccInfo_V2[i].mca_umc_addr; > ecc_info_per_channel->mca_ceumc_addr = > ecc_table->EccInfo_V2[i].mca_ceumc_addr; > + smu->adev->record_ce_addr_supported =1; > } > } > > -- > 2.17.1