[AMD Official Use Only - General]

Series is

Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@xxxxxxx>
Sent: Thursday, January 18, 2024 14:43
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
Subject: [PATCH V2 5/5] drm/amdgpu:Support retiring multiple MCA error address pages

Support retiring multiple MCA error address pages in one in-band query for umc v12_0.

Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  8 ++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 66 +++++++++++++------------
 3 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 61a02dbac087..879e1e59ac76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3909,8 +3909,7 @@ static int ras_err_info_cmp(void *priv, struct list_head *a, struct list_head *b
 }
 
 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
-				struct amdgpu_smuio_mcm_config_info *mcm_info,
-				struct ras_err_addr *err_addr)
+				struct amdgpu_smuio_mcm_config_info *mcm_info)
 {
 	struct ras_err_node *err_node;
 
@@ -3922,10 +3921,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 	if (!err_node)
 		return NULL;
 
-	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+	INIT_LIST_HEAD(&err_node->err_info.err_addr_list);
 
-	if (err_addr)
-		memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
+	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
 
 	err_data->err_list_count++;
 	list_add_tail(&err_node->node, &err_data->err_node_list);
@@ -3934,6 +3932,29 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
 	return &err_node->err_info;
 }
 
+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
+{
+	struct ras_err_addr *mca_err_addr;
+
+	mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
+	if (!mca_err_addr)
+		return;
+
+	INIT_LIST_HEAD(&mca_err_addr->node);
+
+	mca_err_addr->err_status = err_addr->err_status;
+	mca_err_addr->err_ipid = err_addr->err_ipid;
+	mca_err_addr->err_addr = err_addr->err_addr;
+
+	list_add_tail(&mca_err_addr->node, &err_info->err_addr_list);
+}
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr)
+{
+	list_del(&mca_err_addr->node);
+	kfree(mca_err_addr);
+}
+
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 		struct amdgpu_smuio_mcm_config_info *mcm_info,
 		struct ras_err_addr *err_addr, u64 count)
@@ -3946,10 +3967,13 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
 	if (!count)
 		return 0;
 
-	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
 	if (!err_info)
 		return -EINVAL;
 
+	if (err_addr && err_addr->err_status)
+		amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
 	err_info->ue_count += count;
 	err_data->ue_count += count;
 
@@ -3968,7 +3992,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
 	if (!count)
 		return 0;
 
-	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
 	if (!err_info)
 		return -EINVAL;
 
@@ -3990,10 +4014,13 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
 	if (!count)
 		return 0;
 
-	err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
 	if (!err_info)
 		return -EINVAL;
 
+	if (err_addr && err_addr->err_status)
+		amdgpu_ras_add_mca_err_addr(err_info, err_addr);
+
 	err_info->de_count += count;
 	err_data->de_count += count;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 9c3df9985fad..a25aea6ae230 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -474,6 +474,7 @@ struct ras_fs_data {
 };
 
 struct ras_err_addr {
+	struct list_head node;
 	uint64_t err_status;
 	uint64_t err_ipid;
 	uint64_t err_addr;
@@ -484,7 +485,7 @@ struct ras_err_info {
 	u64 ce_count;
 	u64 ue_count;
 	u64 de_count;
-	struct ras_err_addr err_addr;
+	struct list_head err_addr_list;
 };
 
 struct ras_err_node {
@@ -856,4 +857,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
 				  struct aca_handle *handle, char *buf, void *data);
 
+void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
+			struct ras_err_addr *err_addr);
+
+void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
+		struct ras_err_addr *mca_err_addr);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 1e8e97d72f1e..f9dc1855ac4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -385,42 +385,46 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
 {
 	struct ras_err_node *err_node;
 	uint64_t mc_umc_status;
+	struct ras_err_info *err_info;
+	struct ras_err_addr *mca_err_addr, *tmp;
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 
 	for_each_ras_error(err_node, err_data) {
-		mc_umc_status = err_node->err_info.err_addr.err_status;
-		if (!mc_umc_status)
+		err_info = &err_node->err_info;
+		if (list_empty(&err_info->err_addr_list))
 			continue;
 
-		if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
-		    umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
-			uint64_t mca_addr, err_addr, mca_ipid;
-			uint32_t InstanceIdLo;
-			struct amdgpu_smuio_mcm_config_info *mcm_info;
-
-			mcm_info = &err_node->err_info.mcm_info;
-			mca_addr = err_node->err_info.err_addr.err_addr;
-			mca_ipid = err_node->err_info.err_addr.err_ipid;
-
-			err_addr = REG_GET_FIELD(mca_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
-			InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
-
-			dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
-				mca_ipid,
-				mcm_info->die_id,
-				MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-				MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
-				err_addr);
-
-			umc_v12_0_convert_error_address(adev,
-				err_data, err_addr,
-				MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
-				MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
-				mcm_info->die_id);
-
-			/* Clear umc error address content */
-			memset(&err_node->err_info.err_addr,
-				0, sizeof(err_node->err_info.err_addr));
+		list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {
+			mc_umc_status = mca_err_addr->err_status;
+			if (mc_umc_status &&
+			    (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
+			     umc_v12_0_is_deferred_error(adev, mc_umc_status))) {
+				uint64_t mca_addr, err_addr, mca_ipid;
+				uint32_t InstanceIdLo;
+
+				mca_addr = mca_err_addr->err_addr;
+				mca_ipid = mca_err_addr->err_ipid;
+
+				err_addr = REG_GET_FIELD(mca_addr,
+						MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+				InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
+
+				dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
+					mca_ipid,
+					err_info->mcm_info.die_id,
+					MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+					MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+					err_addr);
+
+				umc_v12_0_convert_error_address(adev,
+					err_data, err_addr,
+					MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+					MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+					err_info->mcm_info.die_id);
+			}
+
+			/* Delete error address node from list and free memory */
+			amdgpu_ras_del_mca_err_addr(err_info, mca_err_addr);
 		}
 	}
 }
-- 
2.34.1