[AMD Official Use Only - AMD Internal Distribution Only] ----------------- Best Regards, Thomas -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Chai, Thomas Sent: Thursday, July 18, 2024 11:35 AM To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> Subject: RE: [PATCH 2/3] drm/amdgpu: optimize logging deferred error info [AMD Official Use Only - AMD Internal Distribution Only] [AMD Official Use Only - AMD Internal Distribution Only] ----------------- Best Regards, Thomas -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Thursday, July 18, 2024 10:57 AM To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> Subject: RE: [PATCH 2/3] drm/amdgpu: optimize logging deferred error info [AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Chai, Thomas <YiPeng.Chai@xxxxxxx> > Sent: Wednesday, July 17, 2024 4:16 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao > <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, > Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley > <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> > Subject: [PATCH 2/3] drm/amdgpu: optimize logging deferred error info > > 1. Use pa_pfn as the radix-tree key index to log > deferred error info. > 2. Use local array to store expanded bad pages. > > Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++---- > drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 65 ++++++++++++------------- > drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 5 ++ > 4 files changed, 40 insertions(+), 46 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index dcf1f3dbb5c4..f607ff620015 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -476,10 +476,10 @@ struct ras_err_pages { }; > > struct ras_ecc_err { > - u64 hash_index; > uint64_t status; > uint64_t ipid; > uint64_t addr; > + uint64_t pa_pfn; > struct ras_err_pages err_pages; > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > index 5d08c03fe543..2fc90799bf8d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > @@ -523,18 +523,10 @@ int amdgpu_umc_logs_ecc_err(struct amdgpu_device > *adev, > ecc_log = &con->umc_ecc_log; > > mutex_lock(&ecc_log->lock); > - ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err); > - if (!ret) { > - struct ras_err_pages *err_pages = &ecc_err->err_pages; > - int i; > - > - /* Reserve memory */ > - for (i = 0; i < err_pages->count; i++) > - amdgpu_ras_reserve_page(adev, err_pages->pfn[i]); > - > + ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err); > + if (!ret) > radix_tree_tag_set(ecc_tree, > - ecc_err->hash_index, > UMC_ECC_NEW_DETECTED_TAG); > - } > + ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG); > mutex_unlock(&ecc_log->lock); > > return ret; > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > index eca5ac6a0532..f2235c9ead29 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > @@ -524,9 +524,9 @@ static int umc_v12_0_update_ecc_status(struct > amdgpu_device *adev, > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > uint16_t hwid, mcatype; > uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; > - uint64_t err_addr, hash_val = 0, pa_addr = 0; > + uint64_t err_addr, pa_addr = 0; > struct ras_ecc_err *ecc_err; > - int count, ret; > + int count, ret, i; > > hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); > mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -559,39 > +559,18 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device > *adev, > if (ret) > return ret; > > - memset(page_pfn, 0, sizeof(page_pfn)); > - count = umc_v12_0_expand_addr_to_bad_pages(adev, > - pa_addr, > - page_pfn, ARRAY_SIZE(page_pfn)); > - if (count <= 0) { > - dev_warn(adev->dev, "Fail to convert error address! > count:%d\n", count); > - return 0; > - } > - > - ret = amdgpu_umc_build_pages_hash(adev, > - page_pfn, count, &hash_val); > - if (ret) { > - dev_err(adev->dev, "Fail to build error pages hash\n"); > - return ret; > - } > - > ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL); > if (!ecc_err) > return -ENOMEM; > > - ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), > GFP_KERNEL); > - if (!ecc_err->err_pages.pfn) { > - kfree(ecc_err); > - return -ENOMEM; > - } > - > - memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err- > >err_pages.pfn)); > - ecc_err->err_pages.count = count; > - > - ecc_err->hash_index = hash_val; > ecc_err->status = status; > ecc_err->ipid = ipid; > ecc_err->addr = addr; > + ecc_err->pa_pfn = UMC_V12_ADDR_MASK_BAD_COLS(pa_addr) >> > +AMDGPU_GPU_PAGE_SHIFT; > + > + /* If converted pa_pfn is 0, use pa xor pfn. */ > + if (!ecc_err->pa_pfn) > + ecc_err->pa_pfn = BIT_ULL(UMC_V12_0_PA_R13_BIT) >> > +AMDGPU_GPU_PAGE_SHIFT; >[Tao] why address 0 should be avoided? [Thomas] When address is 0, it means the data has just been loaded from EEPROM, it should calculate pa_pfn. This will be useful for the eeprom new data formats in the future. > > ret = amdgpu_umc_logs_ecc_err(adev, &con- > >umc_ecc_log.de_page_tree, ecc_err); > if (ret) { > @@ -600,13 +579,25 @@ static int umc_v12_0_update_ecc_status(struct > amdgpu_device *adev, > else > dev_err(adev->dev, "Fail to log ecc error! > ret:%d\n", ret); > > - kfree(ecc_err->err_pages.pfn); > kfree(ecc_err); > return ret; > } > > con->umc_ecc_log.de_queried_count++; > > + memset(page_pfn, 0, sizeof(page_pfn)); > + count = umc_v12_0_expand_addr_to_bad_pages(adev, > + pa_addr, > + page_pfn, ARRAY_SIZE(page_pfn)); > + if (count <= 0) { > + dev_warn(adev->dev, "Fail to convert error address! > count:%d\n", count); > + return 0; > + } > + > + /* Reserve memory */ > + for (i = 0; i < count; i++) > + amdgpu_ras_reserve_page(adev, page_pfn[i]); > + > /* The problem case is as follows: > * 1. GPU A triggers a gpu ras reset, and GPU A drives > * GPU B to also perform a gpu ras reset. > @@ -631,16 +622,21 @@ static int umc_v12_0_fill_error_record(struct > amdgpu_device *adev, > struct ras_ecc_err *ecc_err, void > *ras_error_status) { > struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > - uint32_t i = 0; > - int ret = 0; > + uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; > + int ret, i, count; > > if (!err_data || !ecc_err) > return -EINVAL; > > - for (i = 0; i < ecc_err->err_pages.count; i++) { > + memset(page_pfn, 0, sizeof(page_pfn)); > + count = umc_v12_0_expand_addr_to_bad_pages(adev, > + ecc_err->pa_pfn << > AMDGPU_GPU_PAGE_SHIFT, > + page_pfn, ARRAY_SIZE(page_pfn)); > + > + for (i = 0; i < count; i++) { > ret = amdgpu_umc_fill_error_record(err_data, > ecc_err->addr, > - ecc_err->err_pages.pfn[i] << > AMDGPU_GPU_PAGE_SHIFT, > + page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT, > MCA_IPID_2_UMC_CH(ecc_err->ipid), > MCA_IPID_2_UMC_INST(ecc_err->ipid)); > if (ret) > @@ -674,7 +670,8 @@ static void > umc_v12_0_query_ras_ecc_err_addr(struct > amdgpu_device *adev, > dev_err(adev->dev, "Fail to fill umc error > record, ret:%d\n", ret); > break; > } > - radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, > UMC_ECC_NEW_DETECTED_TAG); > + radix_tree_tag_clear(ecc_tree, > + entries[i]->pa_pfn, > UMC_ECC_NEW_DETECTED_TAG); > } > mutex_unlock(&con->umc_ecc_log.lock); > } > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h > b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h > index b4974793850b..be5598d76c1d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h > @@ -81,6 +81,11 @@ > (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \ > (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03)) > > +#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \ > + ((addr) & ~((0x3ULL << UMC_V12_0_PA_C2_BIT) | \ > + (0x1ULL << UMC_V12_0_PA_C4_BIT) | \ > + (0x1ULL << UMC_V12_0_PA_R13_BIT))) > + > bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t > mc_umc_status); bool umc_v12_0_is_uncorrectable_error(struct > amdgpu_device *adev, uint64_t mc_umc_status); bool > umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t > mc_umc_status); > -- > 2.34.1