[AMD Official Use Only - General] > -----Original Message----- > From: Stanley.Yang <Stanley.Yang@xxxxxxx> > Sent: Monday, May 23, 2022 4:17 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Quan, > Evan <Evan.Quan@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx> > Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> > Subject: [PATCH Review 2/2] drm/amdgpu: print umc correctable error > address > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 ++ > drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 55 > ++++++++++++++++++- > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 + > 3 files changed, 60 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 3f23f9ad3249..985b8cddb5a1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1108,6 +1108,11 @@ struct amdgpu_device { > > bool scpm_enabled; > uint32_t scpm_status; > + > + /* Determine smu ecctable whether support > + * record correctable error address > + */ > + int record_ce_addr_supported; > }; > > static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > index 606892dbea1c..47bd39e52e9b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > @@ -119,6 +119,27 @@ static void > umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device > *error_count += 1; > > umc_v6_7_query_error_status_helper(adev, > mc_umc_status, umc_reg_offset); > + > + if (adev->record_ce_addr_supported) { > + uint64_t err_addr, soc_pa; > + uint32_t channel_index = > + adev->umc.channel_idx_tbl[umc_inst * > adev->umc.channel_inst_num + > +ch_inst]; > + > + err_addr = ras- > >umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr; > + err_addr = REG_GET_FIELD(err_addr, > 
MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); > + /* translate umc channel address to soc pa, 3 parts > are included */ > + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | > + > ADDR_OF_256B_BLOCK(channel_index) | > + OFFSET_IN_256B_BLOCK(err_addr); > + > + /* The umc channel bits are not original values, they > are hashed */ > + SET_CHANNEL_HASH(channel_index, soc_pa); > + > + /* clear [C4 C3 C2] in soc physical address */ > + soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); [Tao] This clearing step is preparation for looping over all column bits in the same row; here you only need the physical address of one page, so this code can be removed. > + > + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", > soc_pa); > + } > } > } > > @@ -251,7 +272,9 @@ static void > umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev > > static void umc_v6_7_query_correctable_error_count(struct > amdgpu_device *adev, > uint32_t umc_reg_offset, > - unsigned long > *error_count) > + unsigned long > *error_count, > + uint32_t ch_inst, > + uint32_t umc_inst) > { > uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; > uint32_t ecc_err_cnt, ecc_err_cnt_addr; @@ -295,6 +318,33 @@ > static void umc_v6_7_query_correctable_error_count(struct > amdgpu_device *adev, > *error_count += 1; > > umc_v6_7_query_error_status_helper(adev, > mc_umc_status, umc_reg_offset); > + > + { > + uint64_t err_addr, soc_pa; > + uint32_t mc_umc_addrt0; > + uint32_t channel_index; > + > + mc_umc_addrt0 = > + SOC15_REG_OFFSET(UMC, 0, > regMCA_UMC_UMC0_MCUMC_ADDRT0); > + > + channel_index = > + adev->umc.channel_idx_tbl[umc_inst * > adev->umc.channel_inst_num + > +ch_inst]; > + > + err_addr = RREG64_PCIE((mc_umc_addrt0 + > umc_reg_offset) * 4); > + err_addr = REG_GET_FIELD(err_addr, > MCA_UMC_UMC0_MCUMC_ADDRT0, > +ErrorAddr); > + > + /* translate umc channel address to soc pa, 3 parts > are included */ > + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | > + > ADDR_OF_256B_BLOCK(channel_index) | > + OFFSET_IN_256B_BLOCK(err_addr); > + > + /* The umc channel bits 
are not original values, they > are hashed */ > + SET_CHANNEL_HASH(channel_index, soc_pa); > + > + /* clear [C4 C3 C2] in soc physical address */ > + soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT); > + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", > soc_pa); > + } > } > } > > @@ -395,7 +445,8 @@ static void umc_v6_7_query_ras_error_count(struct > amdgpu_device *adev, > ch_inst); > umc_v6_7_query_correctable_error_count(adev, > umc_reg_offset, > - &(err_data->ce_count)); > + &(err_data->ce_count), > + ch_inst, umc_inst); > umc_v6_7_querry_uncorrectable_error_count(adev, > umc_reg_offset, > &(err_data- > >ue_count)); > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > index e58df9490cec..e182088b4ac8 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > @@ -1908,6 +1908,8 @@ static ssize_t aldebaran_get_ecc_info(struct > smu_context *smu, > ecc_table_v2->EccInfo[i].mca_umc_addr; > ecc_info_per_channel->mca_ceumc_addr = > ecc_table_v2->EccInfo[i].mca_ceumc_addr; > + if (!smu->adev->record_ce_addr_supported) [Tao] it seems the check is unnecessary. > + smu->adev->record_ce_addr_supported =1; > } > } > > -- > 2.17.1