[AMD Official Use Only - General] > -----Original Message----- > From: Chai, Thomas <YiPeng.Chai@xxxxxxx> > Sent: Thursday, April 18, 2024 10:59 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice > <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, > Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> > Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0 > > Retire bad pages for umc v12_0. > > Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 > +++++++++++++++++++++++++- > 1 file changed, 55 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > index 6c2b61ef5b57..bd917eb6ea24 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > @@ -28,6 +28,8 @@ > #include "umc/umc_12_0_0_sh_mask.h" > #include "mp/mp_13_0_6_sh_mask.h" > > +#define MAX_ECC_NUM_PER_RETIREMENT 16 [Tao] we already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for this purpose > + > static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev, > uint32_t node_inst, > uint32_t umc_inst, > @@ -633,6 +635,58 @@ static int umc_v12_0_update_ecc_status(struct > amdgpu_device *adev, > return 0; > } > > +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev, > + struct ras_ecc_err *ecc_err, void > *ras_error_status) { > + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > + uint32_t i = 0; > + int ret = 0; > + > + if (!err_data || !ecc_err) > + return -EINVAL; > + > + for (i = 0; i < ecc_err->err_pages.count; i++) { > + ret = amdgpu_umc_fill_error_record(err_data, > + ecc_err->addr, > + ecc_err->err_pages.pfn[i] << > AMDGPU_GPU_PAGE_SHIFT, > + MCA_IPID_2_UMC_CH(ecc_err->ipid), > + MCA_IPID_2_UMC_INST(ecc_err->ipid)); > + if (ret) 
> + break; > + } > + > + err_data->de_count++; > + > + return ret; > +} > + > +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev, > + void *ras_error_status) > +{ > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT]; > + struct radix_tree_root *ecc_tree; > + int new_detected, ret, i; > + > + ecc_tree = &con->umc_ecc_log.de_page_tree; > + > + mutex_lock(&con->umc_ecc_log.lock); > + new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries, > + 0, ARRAY_SIZE(entries), > UMC_ECC_NEW_DETECTED_TAG); > + for (i = 0; i < new_detected; i++) { > + if (!entries[i]) > + continue; > + > + ret = umc_v12_0_fill_error_record(adev, entries[i], > ras_error_status); > + if (ret) { > + dev_err(adev->dev, "Fail to fill umc error record, > ret:%d\n", ret); > + break; > + } > + radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, > UMC_ECC_NEW_DETECTED_TAG); > + } > + mutex_unlock(&con->umc_ecc_log.lock); > +} > + > struct amdgpu_umc_ras umc_v12_0_ras = { > .ras_block = { > .hw_ops = &umc_v12_0_ras_hw_ops, > @@ -640,8 +694,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = { > }, > .err_cnt_init = umc_v12_0_err_cnt_init, > .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, > - .ecc_info_query_ras_error_count = > umc_v12_0_ecc_info_query_ras_error_count, > - .ecc_info_query_ras_error_address = > umc_v12_0_ecc_info_query_ras_error_address, > + .ecc_info_query_ras_error_address = > umc_v12_0_query_ras_ecc_err_addr, > .check_ecc_err_status = umc_v12_0_check_ecc_err_status, > .update_ecc_status = umc_v12_0_update_ecc_status, }; > -- > 2.34.1