[AMD Official Use Only] > -----Original Message----- > From: Yang, Stanley <Stanley.Yang@xxxxxxx> > Sent: Tuesday, December 21, 2021 2:05 PM > To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, > Hawking <Hawking.Zhang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> > Subject: 回复: [PATCH] drm/amdgpu: save error count in RAS poison handler > > [AMD Official Use Only] > > > +void amdgpu_umc_ras_fini(struct amdgpu_device *adev) { > > + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) > > && > > + adev->umc.ras_if) { > > + struct ras_common_if *ras_if = adev->umc.ras_if; > > + struct ras_ih_if ih_info = { > > + .head = *ras_if, > > + .cb = amdgpu_umc_process_ras_data_cb, > > + }; > > + > > + amdgpu_ras_late_fini(adev, ras_if, &ih_info); > > + kfree(ras_if); > > + } > > +} > > + > > + > > + > [Yang, Stanley] it's better remove extra blank lines. > > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, > > struct amdgpu_irq_src *source, > > struct amdgpu_iv_entry *entry) > > Other than above, patch is reviewed-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > > > -----邮件原件----- > > 发件人: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > > 发送时间: Monday, December 20, 2021 4:51 PM > > 收件人: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > > <Hawking.Zhang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, > > Thomas <YiPeng.Chai@xxxxxxx> > > 抄送: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > > 主题: [PATCH] drm/amdgpu: save error count in RAS poison handler > > > > Otherwise the RAS error count couldn't be queried from sysfs. 
> > > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 170 ++++++++++++------ > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 +- > > 3 files changed, 99 insertions(+), 76 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > > index 0bf09a94d944..776a947b45df 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > > @@ -727,7 +727,7 @@ void > > amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device > > *adev, bo > > > > /* CPU MCA will handle page retirement if connected_to_cpu is 1 */ > > if (!adev->gmc.xgmi.connected_to_cpu) > > - amdgpu_umc_do_page_retirement(adev, &err_data, NULL, > > reset); > > + amdgpu_umc_poison_handler(adev, &err_data, reset); > > else if (reset) > > amdgpu_amdkfd_gpu_reset(adev); > > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > index 0c33f367a4e5..1c2dbd00f647 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > @@ -23,79 +23,7 @@ > > > > #include "amdgpu_ras.h" > > > > -static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, > > - void *ras_error_status, > > - struct amdgpu_iv_entry *entry) > > -{ > > - return amdgpu_umc_do_page_retirement(adev, ras_error_status, > > entry, true); > > -} > > - > > -int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) -{ > > - int r; > > - struct ras_fs_if fs_info = { > > - .sysfs_name = "umc_err_count", > > - }; > > - struct ras_ih_if ih_info = { > > - .cb = amdgpu_umc_process_ras_data_cb, > > - }; > > - > > - if (!adev->umc.ras_if) { > > - adev->umc.ras_if = > > - kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); > > - if (!adev->umc.ras_if) > > - return -ENOMEM; > > - adev->umc.ras_if->block = 
AMDGPU_RAS_BLOCK__UMC; > > - adev->umc.ras_if->type = > > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; > > - adev->umc.ras_if->sub_block_index = 0; > > - } > > - ih_info.head = fs_info.head = *adev->umc.ras_if; > > - > > - r = amdgpu_ras_late_init(adev, adev->umc.ras_if, > > - &fs_info, &ih_info); > > - if (r) > > - goto free; > > - > > - if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) { > > - r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); > > - if (r) > > - goto late_fini; > > - } else { > > - r = 0; > > - goto free; > > - } > > - > > - /* ras init of specific umc version */ > > - if (adev->umc.ras_funcs && > > - adev->umc.ras_funcs->err_cnt_init) > > - adev->umc.ras_funcs->err_cnt_init(adev); > > - > > - return 0; > > - > > -late_fini: > > - amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info); > > -free: > > - kfree(adev->umc.ras_if); > > - adev->umc.ras_if = NULL; > > - return r; > > -} > > - > > -void amdgpu_umc_ras_fini(struct amdgpu_device *adev) -{ > > - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) > > && > > - adev->umc.ras_if) { > > - struct ras_common_if *ras_if = adev->umc.ras_if; > > - struct ras_ih_if ih_info = { > > - .head = *ras_if, > > - .cb = amdgpu_umc_process_ras_data_cb, > > - }; > > - > > - amdgpu_ras_late_fini(adev, ras_if, &ih_info); > > - kfree(ras_if); > > - } > > -} > > - > > -int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, > > +static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, > > void *ras_error_status, > > struct amdgpu_iv_entry *entry, > > bool reset) > > @@ -180,6 +108,102 @@ int amdgpu_umc_do_page_retirement(struct > > amdgpu_device *adev, > > return AMDGPU_RAS_SUCCESS; > > } > > > > +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, > > + void *ras_error_status, > > + bool reset) > > +{ > > + int ret; > > + struct ras_err_data *err_data = (struct ras_err_data > > *)ras_error_status; > > + struct ras_common_if head = { > > + .block = AMDGPU_RAS_BLOCK__UMC, > > + }; > > 
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); > > + > > + ret = > > + amdgpu_umc_do_page_retirement(adev, ras_error_status, > > NULL, reset); > > + > > + if (ret == AMDGPU_RAS_SUCCESS && obj) { > > + obj->err_data.ue_count += err_data->ue_count; > > + obj->err_data.ce_count += err_data->ce_count; > > + } > > + > > + return ret; > > +} > > + > > +static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, > > + void *ras_error_status, > > + struct amdgpu_iv_entry *entry) > > +{ > > + return amdgpu_umc_do_page_retirement(adev, ras_error_status, > > entry, > > +true); } > > + > > +int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) { > > + int r; > > + struct ras_fs_if fs_info = { > > + .sysfs_name = "umc_err_count", > > + }; > > + struct ras_ih_if ih_info = { > > + .cb = amdgpu_umc_process_ras_data_cb, > > + }; > > + > > + if (!adev->umc.ras_if) { > > + adev->umc.ras_if = > > + kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); > > + if (!adev->umc.ras_if) > > + return -ENOMEM; > > + adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC; > > + adev->umc.ras_if->type = > > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; > > + adev->umc.ras_if->sub_block_index = 0; > > + } > > + ih_info.head = fs_info.head = *adev->umc.ras_if; > > + > > + r = amdgpu_ras_late_init(adev, adev->umc.ras_if, > > + &fs_info, &ih_info); > > + if (r) > > + goto free; > > + > > + if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) { > > + r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); > > + if (r) > > + goto late_fini; > > + } else { > > + r = 0; > > + goto free; > > + } > > + > > + /* ras init of specific umc version */ > > + if (adev->umc.ras_funcs && > > + adev->umc.ras_funcs->err_cnt_init) > > + adev->umc.ras_funcs->err_cnt_init(adev); > > + > > + return 0; > > + > > +late_fini: > > + amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info); > > +free: > > + kfree(adev->umc.ras_if); > > + adev->umc.ras_if = NULL; > > + return r; > > +} > > + > > +void 
amdgpu_umc_ras_fini(struct amdgpu_device *adev) { > > + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) > > && > > + adev->umc.ras_if) { > > + struct ras_common_if *ras_if = adev->umc.ras_if; > > + struct ras_ih_if ih_info = { > > + .head = *ras_if, > > + .cb = amdgpu_umc_process_ras_data_cb, > > + }; > > + > > + amdgpu_ras_late_fini(adev, ras_if, &ih_info); > > + kfree(ras_if); > > + } > > +} > > + > > + > > + > [Yang, Stanley] it's better remove extra blank lines. [Tao] Thanks for the reminder; I'll remove them before pushing. > > > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, > > struct amdgpu_irq_src *source, > > struct amdgpu_iv_entry *entry) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > > index 8d18d5121f66..b72194e8bfe5 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > > @@ -78,9 +78,8 @@ struct amdgpu_umc { > > > > int amdgpu_umc_ras_late_init(struct amdgpu_device *adev); void > > amdgpu_umc_ras_fini(struct amdgpu_device *adev); -int > > amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, > > +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, > > void *ras_error_status, > > - struct amdgpu_iv_entry *entry, > > bool reset); > > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, > > struct amdgpu_irq_src *source, > > -- > > 2.17.1