Separate deferred error from UE and CE and log it individually. Signed-off-by: Candice Li <candice.li@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 11 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 116 +++++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 1 + drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 60 ++++----- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 3 + .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 6 +- 7 files changed, 142 insertions(+), 61 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 59fafb8392e0ba..666fd8fa39ad5e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -256,9 +256,14 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo if (type == AMDGPU_MCA_ERROR_TYPE_UE) amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, &err_addr, (uint64_t)count); - else - amdgpu_ras_error_statistic_ce_count(err_data, - &mcm_info, &err_addr, (uint64_t)count); + else { + if (!!(MCA_REG__STATUS__DEFERRED(entry->regs[MCA_REG_IDX_STATUS]))) + amdgpu_ras_error_statistic_de_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + else + amdgpu_ras_error_statistic_ce_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + } } out_mca_release: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ded19182dc792a..94ba10b4184349 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1036,7 +1036,8 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, struct ras_manager *ras_mgr, struct ras_err_data *err_data, const char *blk_name, - bool is_ue) + bool is_ue, + bool is_de) { struct amdgpu_smuio_mcm_config_info *mcm_info; struct ras_err_node *err_node; @@ -1065,25 +1066,50 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, } } else { - for_each_ras_error(err_node, err_data) { - err_info = &err_node->err_info; - mcm_info = &err_info->mcm_info; - if (err_info->ce_count) { + if (is_de) { + for_each_ras_error(err_node, err_data) { + err_info = &err_node->err_info; + mcm_info = &err_info->mcm_info; + if (err_info->de_count) { + dev_info(adev->dev, "socket: %d, die: %d, " + "%lld new deferred hardware errors detected in %s block\n", + mcm_info->socket_id, + mcm_info->die_id, + err_info->de_count, + blk_name); + } + } + + for_each_ras_error(err_node, &ras_mgr->err_data) { + err_info = &err_node->err_info; + mcm_info = &err_info->mcm_info; dev_info(adev->dev, "socket: %d, die: %d, " - "%lld new correctable hardware errors detected in %s block\n", - mcm_info->socket_id, - mcm_info->die_id, - err_info->ce_count, - blk_name); + "%lld deferred hardware errors detected in total in %s block\n", + mcm_info->socket_id, mcm_info->die_id, + err_info->de_count, blk_name); + } + } else { + for_each_ras_error(err_node, err_data) { + err_info = &err_node->err_info; + mcm_info = &err_info->mcm_info; + if (err_info->ce_count) { + dev_info(adev->dev, "socket: %d, die: %d, " + "%lld new correctable hardware errors detected in %s block\n", + mcm_info->socket_id, + mcm_info->die_id, + err_info->ce_count, + blk_name); + } } - } - for_each_ras_error(err_node, &ras_mgr->err_data) { - err_info = &err_node->err_info; - mcm_info = &err_info->mcm_info; - dev_info(adev->dev, "socket: %d, die: %d, " - "%lld correctable hardware errors detected in total in %s block\n", - mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name); + for_each_ras_error(err_node, &ras_mgr->err_data) { + err_info = &err_node->err_info; + mcm_info = &err_info->mcm_info; + dev_info(adev->dev, "socket: %d, die: %d, " + "%lld correctable hardware errors detected in total in %s block\n", + mcm_info->socket_id, mcm_info->die_id, + err_info->ce_count, blk_name); + } } } } @@ -1102,7 +1128,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, if (err_data->ce_count) { if (err_data_has_source_info(err_data)) { - amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false); + amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, + blk_name, false, false); } else if (!adev->aid_mask && adev->smuio.funcs && adev->smuio.funcs->get_socket_id && @@ -1124,7 +1151,8 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, if (err_data->ue_count) { if (err_data_has_source_info(err_data)) { - amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true); + amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, + blk_name, true, false); } else if (!adev->aid_mask && adev->smuio.funcs && adev->smuio.funcs->get_socket_id && @@ -1144,6 +1172,28 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, } } + if (err_data->de_count) { + if (err_data_has_source_info(err_data)) { + amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, + blk_name, false, true); + } else if (!adev->aid_mask && + adev->smuio.funcs && + adev->smuio.funcs->get_socket_id && + adev->smuio.funcs->get_die_id) { + dev_info(adev->dev, "socket: %d, die: %d " + "%ld deferred hardware errors " + "detected in %s block\n", + adev->smuio.funcs->get_socket_id(adev), + adev->smuio.funcs->get_die_id(adev), + ras_mgr->err_data.de_count, + blk_name); + } else { + dev_info(adev->dev, "%ld deferred hardware errors " + "detected in %s block\n", + ras_mgr->err_data.de_count, + blk_name); + } + } } static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data) @@ -1154,7 +1204,8 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s if (err_data_has_source_info(err_data)) { for_each_ras_error(err_node, err_data) { err_info = &err_node->err_info; - + amdgpu_ras_error_statistic_de_count(&obj->err_data, + &err_info->mcm_info, NULL, err_info->de_count); amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, NULL, err_info->ce_count); amdgpu_ras_error_statistic_ue_count(&obj->err_data, @@ -1164,6 +1215,7 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s /* for legacy asic path which doesn't has error source info */ obj->err_data.ue_count += err_data->ue_count; obj->err_data.ce_count += err_data->ce_count; + obj->err_data.de_count += err_data->de_count; } } @@ -1239,6 +1291,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i info->ue_count = obj->err_data.ue_count; info->ce_count = obj->err_data.ce_count; + info->de_count = obj->err_data.de_count; amdgpu_ras_error_generate_report(adev, info, &err_data); @@ -1951,6 +2004,7 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, */ obj->err_data.ue_count += err_data.ue_count; obj->err_data.ce_count += err_data.ce_count; + obj->err_data.de_count += err_data.de_count; } amdgpu_ras_error_data_fini(&err_data); @@ -3782,6 +3836,28 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, return 0; } +int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, + struct ras_err_addr *err_addr, u64 count) +{ + struct ras_err_info *err_info; + + if (!err_data || !mcm_info) + return -EINVAL; + + if (!count) + return 0; + + err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr); + if (!err_info) + return -EINVAL; + + err_info->de_count += count; + err_data->de_count += count; + + return 0; +} + #define mmMP0_SMN_C2PMSG_92 0x1609C #define mmMP0_SMN_C2PMSG_126 0x160BE static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 9edc3909f17d7a..fae53d87578bd7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -476,6 +476,7 @@ struct ras_err_info { struct amdgpu_smuio_mcm_config_info mcm_info; u64 ce_count; u64 ue_count; + u64 de_count; struct ras_err_addr err_addr; }; @@ -487,6 +488,7 @@ struct ras_err_node { struct ras_err_data { unsigned long ue_count; unsigned long ce_count; + unsigned long de_count; unsigned long err_addr_cnt; struct eeprom_table_record *err_addr; u32 err_list_count; @@ -562,6 +564,7 @@ struct ras_query_if { struct ras_common_if head; unsigned long ue_count; unsigned long ce_count; + unsigned long de_count; }; struct ras_inject_if { @@ -833,5 +836,8 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, struct amdgpu_smuio_mcm_config_info *mcm_info, struct ras_err_addr *err_addr, u64 count); +int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, + struct ras_err_addr *err_addr, u64 count); void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index d65e21914d8c4a..848df7acdd3210 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -210,6 +210,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) if (ret == AMDGPU_RAS_SUCCESS && obj) { obj->err_data.ue_count += err_data.ue_count; obj->err_data.ce_count += err_data.ce_count; + obj->err_data.de_count += err_data.de_count; } amdgpu_ras_error_data_fini(&err_data); diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 7458a218e89db1..7cabdcaf0f7e3d 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -89,12 +89,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device *adev) umc_v12_0_reset_error_count_per_channel, NULL); } +bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status) +{ + return (amdgpu_ras_is_poison_mode_supported(adev) && + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)); +} + bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status) { - if (amdgpu_ras_is_poison_mode_supported(adev) && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)) - return true; + if (umc_v12_0_is_deferred_error(adev, mc_umc_status)) + return false; return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || @@ -104,9 +109,7 @@ bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_um bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status) { - if (amdgpu_ras_is_poison_mode_supported(adev) && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && - (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)) + if (umc_v12_0_is_deferred_error(adev, mc_umc_status)) return false; return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && @@ -119,29 +122,10 @@ bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_ !(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status))))); } -static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev, +static void umc_v12_0_query_error_count_per_type(struct amdgpu_device *adev, uint64_t umc_reg_offset, - unsigned long *error_count) -{ - uint64_t mc_umc_status; - uint64_t mc_umc_status_addr; - - mc_umc_status_addr = - SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); - - /* Rely on MCUMC_STATUS for correctable error counter - * MCUMC_STATUS is a 64 bit register - */ - mc_umc_status = - RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4); - - if (umc_v12_0_is_correctable_error(adev, mc_umc_status)) - *error_count += 1; -} - -static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev, - uint64_t umc_reg_offset, - unsigned long *error_count) + unsigned long *error_count, + check_error_type_func error_type_func) { uint64_t mc_umc_status; uint64_t mc_umc_status_addr; @@ -149,11 +133,11 @@ static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); - /* Check the MCUMC_STATUS. */ + /* Check MCUMC_STATUS */ mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4); - if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) + if (error_type_func(adev, mc_umc_status)) *error_count += 1; } @@ -162,7 +146,7 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev, uint32_t ch_inst, void *data) { struct ras_err_data *err_data = (struct ras_err_data *)data; - unsigned long ue_count = 0, ce_count = 0; + unsigned long ue_count = 0, ce_count = 0, de_count = 0; /* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3], * which can be used as die ID directly */ @@ -174,11 +158,16 @@ static int umc_v12_0_query_error_count(struct amdgpu_device *adev, uint64_t umc_reg_offset = get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); - umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, &ce_count); - umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, &ue_count); + umc_v12_0_query_error_count_per_type(adev, umc_reg_offset, + &ce_count, umc_v12_0_is_correctable_error); + umc_v12_0_query_error_count_per_type(adev, umc_reg_offset, + &ue_count, umc_v12_0_is_uncorrectable_error); + umc_v12_0_query_error_count_per_type(adev, umc_reg_offset, + &de_count, umc_v12_0_is_deferred_error); amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, ue_count); amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, ce_count); + amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, de_count); return 0; } @@ -392,7 +381,8 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade if (!mc_umc_status) continue; - if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) { + if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) || + umc_v12_0_is_deferred_error(adev, mc_umc_status)) { uint64_t mca_addr, err_addr, mca_ipid; uint32_t InstanceIdLo; struct amdgpu_smuio_mcm_config_info *mcm_info; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index e8de3a92251a2c..5973bfb14fceec 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -121,9 +121,12 @@ (((_ipid_lo) >> 12) & 0xF)) #define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7) +bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status); bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status); bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status); +typedef bool (*check_error_type_func)(struct amdgpu_device *adev, uint64_t mc_umc_status); + extern const uint32_t umc_v12_0_channel_idx_tbl[] [UMC_V12_0_UMC_INSTANCE_NUM] diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 4ebc6b421c2cb4..f13f957c8ac195 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2553,9 +2553,9 @@ static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct return 0; } - if (type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0)) - *count = 1; - else if (type == AMDGPU_MCA_ERROR_TYPE_CE && umc_v12_0_is_correctable_error(adev, status0)) + if ((type == AMDGPU_MCA_ERROR_TYPE_UE && umc_v12_0_is_uncorrectable_error(adev, status0)) || + (type == AMDGPU_MCA_ERROR_TYPE_CE && (umc_v12_0_is_correctable_error(adev, status0) || + umc_v12_0_is_deferred_error(adev, status0)))) *count = 1; return 0; -- 2.25.1