[AMD Official Use Only] > +void amdgpu_umc_ras_fini(struct amdgpu_device *adev) { > + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) > && > + adev->umc.ras_if) { > + struct ras_common_if *ras_if = adev->umc.ras_if; > + struct ras_ih_if ih_info = { > + .head = *ras_if, > + .cb = amdgpu_umc_process_ras_data_cb, > + }; > + > + amdgpu_ras_late_fini(adev, ras_if, &ih_info); > + kfree(ras_if); > + } > +} > + > + > + [Yang, Stanley] it's better to remove the extra blank lines. > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, > struct amdgpu_irq_src *source, > struct amdgpu_iv_entry *entry) Other than the above, the patch is Reviewed-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > -----邮件原件----- > 发件人: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > 发送时间: Monday, December 20, 2021 4:51 PM > 收件人: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, > Thomas <YiPeng.Chai@xxxxxxx> > 抄送: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > 主题: [PATCH] drm/amdgpu: save error count in RAS poison handler > > Otherwise the RAS error count couldn't be queried from sysfs. 
> > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 170 ++++++++++++------ > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 +- > 3 files changed, 99 insertions(+), 76 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > index 0bf09a94d944..776a947b45df 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > @@ -727,7 +727,7 @@ void > amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device > *adev, bo > > /* CPU MCA will handle page retirement if connected_to_cpu is 1 */ > if (!adev->gmc.xgmi.connected_to_cpu) > - amdgpu_umc_do_page_retirement(adev, &err_data, NULL, > reset); > + amdgpu_umc_poison_handler(adev, &err_data, reset); > else if (reset) > amdgpu_amdkfd_gpu_reset(adev); > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > index 0c33f367a4e5..1c2dbd00f647 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > @@ -23,79 +23,7 @@ > > #include "amdgpu_ras.h" > > -static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, > - void *ras_error_status, > - struct amdgpu_iv_entry *entry) > -{ > - return amdgpu_umc_do_page_retirement(adev, ras_error_status, > entry, true); > -} > - > -int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) -{ > - int r; > - struct ras_fs_if fs_info = { > - .sysfs_name = "umc_err_count", > - }; > - struct ras_ih_if ih_info = { > - .cb = amdgpu_umc_process_ras_data_cb, > - }; > - > - if (!adev->umc.ras_if) { > - adev->umc.ras_if = > - kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); > - if (!adev->umc.ras_if) > - return -ENOMEM; > - adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC; > - adev->umc.ras_if->type = > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; > - adev->umc.ras_if->sub_block_index = 
0; > - } > - ih_info.head = fs_info.head = *adev->umc.ras_if; > - > - r = amdgpu_ras_late_init(adev, adev->umc.ras_if, > - &fs_info, &ih_info); > - if (r) > - goto free; > - > - if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) { > - r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); > - if (r) > - goto late_fini; > - } else { > - r = 0; > - goto free; > - } > - > - /* ras init of specific umc version */ > - if (adev->umc.ras_funcs && > - adev->umc.ras_funcs->err_cnt_init) > - adev->umc.ras_funcs->err_cnt_init(adev); > - > - return 0; > - > -late_fini: > - amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info); > -free: > - kfree(adev->umc.ras_if); > - adev->umc.ras_if = NULL; > - return r; > -} > - > -void amdgpu_umc_ras_fini(struct amdgpu_device *adev) -{ > - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) > && > - adev->umc.ras_if) { > - struct ras_common_if *ras_if = adev->umc.ras_if; > - struct ras_ih_if ih_info = { > - .head = *ras_if, > - .cb = amdgpu_umc_process_ras_data_cb, > - }; > - > - amdgpu_ras_late_fini(adev, ras_if, &ih_info); > - kfree(ras_if); > - } > -} > - > -int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, > +static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, > void *ras_error_status, > struct amdgpu_iv_entry *entry, > bool reset) > @@ -180,6 +108,102 @@ int amdgpu_umc_do_page_retirement(struct > amdgpu_device *adev, > return AMDGPU_RAS_SUCCESS; > } > > +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, > + void *ras_error_status, > + bool reset) > +{ > + int ret; > + struct ras_err_data *err_data = (struct ras_err_data > *)ras_error_status; > + struct ras_common_if head = { > + .block = AMDGPU_RAS_BLOCK__UMC, > + }; > + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); > + > + ret = > + amdgpu_umc_do_page_retirement(adev, ras_error_status, > NULL, reset); > + > + if (ret == AMDGPU_RAS_SUCCESS && obj) { > + obj->err_data.ue_count += err_data->ue_count; > + obj->err_data.ce_count 
+= err_data->ce_count; > + } > + > + return ret; > +} > + > +static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, > + void *ras_error_status, > + struct amdgpu_iv_entry *entry) > +{ > + return amdgpu_umc_do_page_retirement(adev, ras_error_status, > entry, > +true); } > + > +int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) { > + int r; > + struct ras_fs_if fs_info = { > + .sysfs_name = "umc_err_count", > + }; > + struct ras_ih_if ih_info = { > + .cb = amdgpu_umc_process_ras_data_cb, > + }; > + > + if (!adev->umc.ras_if) { > + adev->umc.ras_if = > + kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); > + if (!adev->umc.ras_if) > + return -ENOMEM; > + adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC; > + adev->umc.ras_if->type = > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; > + adev->umc.ras_if->sub_block_index = 0; > + } > + ih_info.head = fs_info.head = *adev->umc.ras_if; > + > + r = amdgpu_ras_late_init(adev, adev->umc.ras_if, > + &fs_info, &ih_info); > + if (r) > + goto free; > + > + if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) { > + r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); > + if (r) > + goto late_fini; > + } else { > + r = 0; > + goto free; > + } > + > + /* ras init of specific umc version */ > + if (adev->umc.ras_funcs && > + adev->umc.ras_funcs->err_cnt_init) > + adev->umc.ras_funcs->err_cnt_init(adev); > + > + return 0; > + > +late_fini: > + amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info); > +free: > + kfree(adev->umc.ras_if); > + adev->umc.ras_if = NULL; > + return r; > +} > + > +void amdgpu_umc_ras_fini(struct amdgpu_device *adev) { > + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) > && > + adev->umc.ras_if) { > + struct ras_common_if *ras_if = adev->umc.ras_if; > + struct ras_ih_if ih_info = { > + .head = *ras_if, > + .cb = amdgpu_umc_process_ras_data_cb, > + }; > + > + amdgpu_ras_late_fini(adev, ras_if, &ih_info); > + kfree(ras_if); > + } > +} > + > + > + [Yang, Stanley] it's better remove extra 
blank lines. > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, > struct amdgpu_irq_src *source, > struct amdgpu_iv_entry *entry) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > index 8d18d5121f66..b72194e8bfe5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > @@ -78,9 +78,8 @@ struct amdgpu_umc { > > int amdgpu_umc_ras_late_init(struct amdgpu_device *adev); void > amdgpu_umc_ras_fini(struct amdgpu_device *adev); -int > amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, > +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, > void *ras_error_status, > - struct amdgpu_iv_entry *entry, > bool reset); > int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, > struct amdgpu_irq_src *source, > -- > 2.17.1