[AMD Official Use Only - General] + dev_info(adev->dev, "Total %ld uncorrectable hardware errors detected in %s block\n", + ras_mgr->err_data.ue_count, blk_name); Please remove "Total" from the kernel message. With that addressed, the series is Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Regards, Hawking -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Yang Wang Sent: Thursday, October 12, 2023 21:14 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> Subject: [PATCH 2/6] drm/amdgpu: add ras_err_info to identify RAS error source introduced "ras_err_info" to better identify a RAS ERROR source. NOTE: For legacy chips, keep the original RAS error print format. v1: RAS errors may come from different dies during a RAS error query, therefore, need a new data structure to identify the source of RAS ERROR. v2: - use new data structure 'amdgpu_smuio_mcm_config_info' instead of ras_err_id (in v1 patch) - refine ras error dump function name - refine ras error dump log format Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 289 ++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 28 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 27 +- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 7 +- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 7 +- 6 files changed, 310 insertions(+), 53 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5fb57419ef77..2522d29806dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -152,8 +152,9 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) { - struct ras_err_data err_data = {0, 0, 0, NULL}; + struct ras_err_data err_data; struct eeprom_table_record err_rec; + int ret; if ((address >= adev->gmc.mc_vram_size) || (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { @@ -170,6 +171,10 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre return 0; } + ret = amdgpu_ras_error_data_init(&err_data); + if (ret) + return ret; + memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); err_data.err_addr = &err_rec; amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0); @@ -180,6 +185,8 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre amdgpu_ras_save_bad_pages(adev, NULL); } + amdgpu_ras_error_data_fini(&err_data); + dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); dev_warn(adev->dev, "Clear EEPROM:\n"); dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); @@ -1015,17 +1022,113 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d } } +static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, struct ras_query_if *query_if, + struct ras_err_data *err_data, bool is_ue) { + struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); + const char *blk_name = get_ras_block_str(&query_if->head); + struct amdgpu_smuio_mcm_config_info *mcm_info; + struct ras_err_node *err_node; + struct ras_err_info *err_info; + + if (is_ue) + dev_info(adev->dev, "Total %ld uncorrectable hardware errors detected in %s block\n", + ras_mgr->err_data.ue_count, blk_name); + else + dev_info(adev->dev, "Total %ld correctable hardware errors detected in %s block\n", + ras_mgr->err_data.ue_count, blk_name); + + for_each_ras_error(err_node, err_data) { + err_info = &err_node->err_info; + mcm_info = &err_info->mcm_info; + if (is_ue && err_info->ue_count) { + dev_info(adev->dev, "socket: %d, die: %d, " + "%lld uncorrectable hardware errors detected in %s block\n", + mcm_info->socket_id, + mcm_info->die_id, + err_info->ue_count, + blk_name); + } else if (!is_ue && err_info->ce_count) { + dev_info(adev->dev, "socket: %d, die: %d, " + "%lld correctable hardware errors detected in %s block\n", + mcm_info->socket_id, + mcm_info->die_id, + err_info->ue_count, + blk_name); + } + } +} + +static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, struct ras_query_if *query_if, + struct ras_err_data *err_data) { + struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); + const char *blk_name = get_ras_block_str(&query_if->head); + + if (err_data->ce_count) { + if (!list_empty(&err_data->err_node_list)) { + amdgpu_ras_error_print_error_data(adev, query_if, err_data, false); + } else if (!adev->aid_mask && + adev->smuio.funcs && + adev->smuio.funcs->get_socket_id && + adev->smuio.funcs->get_die_id) { + dev_info(adev->dev, "socket: %d, die: %d " + "%ld correctable hardware errors " + "detected in %s block, no user " + "action is needed.\n", + adev->smuio.funcs->get_socket_id(adev), + adev->smuio.funcs->get_die_id(adev), + ras_mgr->err_data.ce_count, + blk_name); + } else { + dev_info(adev->dev, "%ld correctable hardware errors " + "detected in %s block, no user " + "action is needed.\n", + ras_mgr->err_data.ce_count, + blk_name); + } + } + + if (err_data->ue_count) { + if (!list_empty(&err_data->err_node_list)) { + amdgpu_ras_error_print_error_data(adev, query_if, err_data, true); + } else if (!adev->aid_mask && + adev->smuio.funcs && + adev->smuio.funcs->get_socket_id && + adev->smuio.funcs->get_die_id) { + dev_info(adev->dev, "socket: %d, die: %d " + "%ld uncorrectable hardware errors " + "detected in %s block\n", + adev->smuio.funcs->get_socket_id(adev), + adev->smuio.funcs->get_die_id(adev), + ras_mgr->err_data.ue_count, + blk_name); + } else { + dev_info(adev->dev, "%ld uncorrectable hardware errors " + "detected in %s block\n", + ras_mgr->err_data.ue_count, + blk_name); + } + } + +} + /* query/inject/cure begin */ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) { struct amdgpu_ras_block_object *block_obj = NULL; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); - struct ras_err_data err_data = {0, 0, 0, NULL}; + struct ras_err_data err_data; + int ret; if (!obj) return -EINVAL; + ret = amdgpu_ras_error_data_init(&err_data); + if (ret) + return ret; + if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { amdgpu_ras_get_ecc_info(adev, &err_data); } else { @@ -1033,7 +1136,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", get_ras_block_str(&info->head)); - return -EINVAL; + ret = -EINVAL; + goto out_fini_err_data; } if (block_obj->hw_ops->query_ras_error_count) @@ -1053,48 +1157,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, info->ue_count = obj->err_data.ue_count; info->ce_count = obj->err_data.ce_count; - if (err_data.ce_count) { - if (!adev->aid_mask && - adev->smuio.funcs && - adev->smuio.funcs->get_socket_id && - adev->smuio.funcs->get_die_id) { - dev_info(adev->dev, "socket: %d, die: %d " - "%ld correctable hardware errors " - "detected in %s block, no user " - "action is needed.\n", - adev->smuio.funcs->get_socket_id(adev), - adev->smuio.funcs->get_die_id(adev), - obj->err_data.ce_count, - get_ras_block_str(&info->head)); - } else { - dev_info(adev->dev, "%ld correctable hardware errors " - "detected in %s block, no user " - "action is needed.\n", - obj->err_data.ce_count, - get_ras_block_str(&info->head)); - } - } - if (err_data.ue_count) { - if (!adev->aid_mask && - adev->smuio.funcs && - adev->smuio.funcs->get_socket_id && - adev->smuio.funcs->get_die_id) { - dev_info(adev->dev, "socket: %d, die: %d " - "%ld uncorrectable hardware errors " - "detected in %s block\n", - adev->smuio.funcs->get_socket_id(adev), - adev->smuio.funcs->get_die_id(adev), - obj->err_data.ue_count, - get_ras_block_str(&info->head)); - } else { - dev_info(adev->dev, "%ld uncorrectable hardware errors " - "detected in %s block\n", - obj->err_data.ue_count, - get_ras_block_str(&info->head)); - } - } + amdgpu_ras_error_generate_report(adev, info, &err_data); - return 0; +out_fini_err_data: + amdgpu_ras_error_data_fini(&err_data); + + return ret; } int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, @@ -1744,12 +1812,16 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, struct amdgpu_iv_entry *entry) { struct ras_ih_data *data = &obj->ih_data; - struct ras_err_data err_data = {0, 0, 0, NULL}; + struct ras_err_data err_data; int ret; if (!data->cb) return; + ret = amdgpu_ras_error_data_init(&err_data); + if (ret) + return; + /* Let IP handle its data, maybe we need get the output * from the callback to update the error type/count, etc */ @@ -1766,6 +1838,8 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, obj->err_data.ue_count += err_data.ue_count; obj->err_data.ce_count += err_data.ce_count; } + + amdgpu_ras_error_data_fini(&err_data); } static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) @@ -3383,3 +3457,128 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, WREG32(err_status_hi_offset, 0); } } + +int amdgpu_ras_error_data_init(struct ras_err_data *err_data) { + memset(err_data, 0, sizeof(*err_data)); + + INIT_LIST_HEAD(&err_data->err_node_list); + + return 0; +} + +static void amdgpu_ras_error_node_release(struct ras_err_node +*err_node) { + if (!err_node) + return; + + list_del(&err_node->node); + kvfree(err_node); +} + +void amdgpu_ras_error_data_fini(struct ras_err_data *err_data) { + struct ras_err_node *err_node, *tmp; + + list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) { + amdgpu_ras_error_node_release(err_node); + list_del(&err_node->node); + } +} + +static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info) { + struct ras_err_node *err_node; + struct amdgpu_smuio_mcm_config_info *ref_id; + + if (!err_data || !mcm_info) + return NULL; + + for_each_ras_error(err_node, err_data) { + ref_id = &err_node->err_info.mcm_info; + if ((mcm_info->socket_id >= 0 && mcm_info->socket_id != ref_id->socket_id) || + (mcm_info->die_id >= 0 && mcm_info->die_id != ref_id->die_id)) + continue; + + return err_node; + } + + return NULL; +} + +static struct ras_err_node *amdgpu_ras_error_node_new(void) { + struct ras_err_node *err_node; + + err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL); + if (!err_node) + return NULL; + + INIT_LIST_HEAD(&err_node->node); + + return err_node; +} + +static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info) { + struct ras_err_node *err_node; + + err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info); + if (err_node) + return &err_node->err_info; + + err_node = amdgpu_ras_error_node_new(); + if (!err_node) + return NULL; + + memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); + + err_data->err_list_count++; + list_add_tail(&err_node->node, &err_data->err_node_list); + + return &err_node->err_info; +} + +int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, uint64_t count) { + struct ras_err_info *err_info; + + if (!err_data || !mcm_info) + return -EINVAL; + + if (!count) + return 0; + + err_info = amdgpu_ras_error_get_info(err_data, mcm_info); + if (!err_info) + return -EINVAL; + + err_info->ue_count += count; + err_data->ue_count += count; + + return 0; +} + +int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, uint64_t count) { + struct ras_err_info *err_info; + + if (!err_data || !mcm_info) + return -EINVAL; + + if (!count) + return 0; + + err_info = amdgpu_ras_error_get_info(err_data, mcm_info); + if (!err_info) + return -EINVAL; + + err_info->ce_count += count; + err_data->ce_count += count; + + return 0; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 46b0dcf846dc..d6a37d5b9809 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -28,6 +28,7 @@ #include <linux/list.h> #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" +#include "amdgpu_smuio.h" struct amdgpu_iv_entry; @@ -443,13 +444,30 @@ struct ras_fs_data { char debugfs_name[32]; }; + +struct ras_err_info { + struct amdgpu_smuio_mcm_config_info mcm_info; + uint64_t ce_count; + uint64_t ue_count; +}; + +struct ras_err_node { + struct list_head node; + struct ras_err_info err_info; +}; + struct ras_err_data { unsigned long ue_count; unsigned long ce_count; unsigned long err_addr_cnt; struct eeprom_table_record *err_addr; + uint32_t err_list_count; + struct list_head err_node_list; }; +#define for_each_ras_error(err_node, err_data) \ + list_for_each_entry(err_node, &(err_data)->err_node_list, node) + struct ras_err_handler_data { /* point to bad page records array */ struct eeprom_table_record *bps; @@ -773,4 +791,14 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, const struct amdgpu_ras_err_status_reg_entry *reg_list, uint32_t reg_list_size, uint32_t instance); + +int amdgpu_ras_error_data_init(struct ras_err_data *err_data); void +amdgpu_ras_error_data_fini(struct ras_err_data *err_data); int +amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, uint64_t count); +int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, uint64_t count); + #endif + + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h index 5910d50ac74d..ff4435181055 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h @@ -30,6 +30,11 @@ enum amdgpu_pkg_type { AMDGPU_PKG_TYPE_UNKNOWN, }; +struct amdgpu_smuio_mcm_config_info { + int socket_id; + int die_id; +}; + struct amdgpu_smuio_funcs { u32 (*get_rom_index_offset)(struct amdgpu_device *adev); u32 (*get_rom_data_offset)(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 24fcc9a2e422..f74347cc087a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -45,8 +45,12 @@ static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) { - struct ras_err_data err_data = {0, 0, 0, NULL}; - int ret = AMDGPU_RAS_FAIL; + struct ras_err_data err_data; + int ret; + + ret = amdgpu_ras_error_data_init(&err_data); + if (ret) + return ret; err_data.err_addr = kcalloc(adev->umc.max_ras_err_cnt_per_query, @@ -54,7 +58,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, if (!err_data.err_addr) { dev_warn(adev->dev, "Failed to alloc memory for umc error record in MCA notifier!\n"); - return AMDGPU_RAS_FAIL; + ret = AMDGPU_RAS_FAIL; + goto out_fini_err_data; } /* @@ -63,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, ch_inst, umc_inst); if (ret) - goto out; + goto out_free_err_addr; if (amdgpu_bad_page_threshold != 0) { amdgpu_ras_add_bad_pages(adev, err_data.err_addr, @@ -71,8 +76,12 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, amdgpu_ras_save_bad_pages(adev, NULL); } -out: +out_free_err_addr: kfree(err_data.err_addr); + +out_fini_err_data: + amdgpu_ras_error_data_fini(&err_data); + return ret; } @@ -182,18 +191,24 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) } if (!amdgpu_sriov_vf(adev)) { - struct ras_err_data err_data = {0, 0, 0, NULL}; + struct ras_err_data err_data; struct ras_common_if head = { .block = AMDGPU_RAS_BLOCK__UMC, }; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + ret = amdgpu_ras_error_data_init(&err_data); + if (ret) + return ret; + ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); if (ret == AMDGPU_RAS_SUCCESS && obj) { obj->err_data.ue_count += err_data.ue_count; obj->err_data.ce_count += err_data.ce_count; } + + amdgpu_ras_error_data_fini(&err_data); } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) adev->virt.ops->ras_poison_handler(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 7d6d7734dbec..6d24c84924cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -365,9 +365,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device { uint32_t bif_doorbell_intr_cntl; struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); - struct ras_err_data err_data = {0, 0, 0, NULL}; + struct ras_err_data err_data; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + if (amdgpu_ras_error_data_init(&err_data)) + return; + if (adev->asic_type == CHIP_ALDEBARAN) bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL_ALDE); else @@ -418,6 +421,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device */ amdgpu_ras_reset_gpu(adev); } + + amdgpu_ras_error_data_fini(&err_data); } static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index cc034a740605..eccb006e78aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -560,9 +560,12 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device { uint32_t bif_doorbell_intr_cntl; struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); - struct ras_err_data err_data = {0, 0, 0, NULL}; + struct ras_err_data err_data; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + if (amdgpu_ras_error_data_init(&err_data)) + return; + bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); if (REG_GET_FIELD(bif_doorbell_intr_cntl, @@ -607,6 +610,8 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device */ amdgpu_ras_reset_gpu(adev); } + + amdgpu_ras_error_data_fini(&err_data); } static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) -- 2.34.1