[AMD Official Use Only - General] Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Yang > Wang > Sent: Thursday, March 14, 2024 4:12 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx> > Subject: [PATCH] drm/amdgpu: add ras event id support > > add amdgpu ras event id support to better distinguish different error information > sources in dmesg logs. > > the following logs will be identified by event id: > {event_id} interrupt to inform RAS event > {event_id} ACA logs > {event_id} error statistics since the current injection/error query > {event_id} error statistics since gpu load > > Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> > Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 32 ++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 3 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 203 +++++++++++++++-------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 30 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 + > drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 10 +- > 6 files changed, 191 insertions(+), 88 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c > index 24ad4b97177b..0734490347db 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c > @@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct > amdgpu_device *adev, bool enable) > return -EOPNOTSUPP; > } > > -static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, > int idx, struct mca_bank_entry *entry) > +static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, > int idx, struct mca_bank_entry *entry, > + struct ras_query_context *qctx) > { > - dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events > logged\n"); > - dev_info(adev->dev, HW_ERR 
"aca entry[%02d].STATUS=0x%016llx\n", > - idx, entry->regs[MCA_REG_IDX_STATUS]); > - dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n", > - idx, entry->regs[MCA_REG_IDX_ADDR]); > - dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n", > - idx, entry->regs[MCA_REG_IDX_MISC0]); > - dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n", > - idx, entry->regs[MCA_REG_IDX_IPID]); > - dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n", > - idx, entry->regs[MCA_REG_IDX_SYND]); > + u64 event_id = qctx->event_id; > + > + RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check > Architecture events logged\n"); > + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca > entry[%02d].STATUS=0x%016llx\n", > + idx, entry->regs[MCA_REG_IDX_STATUS]); > + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca > entry[%02d].ADDR=0x%016llx\n", > + idx, entry->regs[MCA_REG_IDX_ADDR]); > + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca > entry[%02d].MISC0=0x%016llx\n", > + idx, entry->regs[MCA_REG_IDX_MISC0]); > + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca > entry[%02d].IPID=0x%016llx\n", > + idx, entry->regs[MCA_REG_IDX_IPID]); > + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca > entry[%02d].SYND=0x%016llx\n", > + idx, entry->regs[MCA_REG_IDX_SYND]); > } > > -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum > amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data > *err_data) > +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum > amdgpu_ras_block blk, enum amdgpu_mca_error_type type, > + struct ras_err_data *err_data, struct > ras_query_context *qctx) > { > struct amdgpu_smuio_mcm_config_info mcm_info; > struct ras_err_addr err_addr = {0}; > @@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct > amdgpu_device *adev, enum amdgpu_ras_blo > list_for_each_entry(node, &mca_set.list, node) { > entry = &node->entry; > > - amdgpu_mca_smu_mca_bank_dump(adev, i++, entry); > + amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx); > > 
count = 0; > ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, > entry, &count); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h > index b964110ed1e0..e5bf07ce3451 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h > @@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct > amdgpu_device *adev, struct dentry *root void > amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set); int > amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct > mca_bank_entry *entry); void amdgpu_mca_bank_set_release(struct > mca_bank_set *mca_set); -int amdgpu_mca_smu_log_ras_error(struct > amdgpu_device *adev, enum amdgpu_ras_block blk, enum > amdgpu_mca_error_type type, struct ras_err_data *err_data); > +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum > amdgpu_ras_block blk, enum amdgpu_mca_error_type type, > + struct ras_err_data *err_data, struct > ras_query_context *qctx); > > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 8ebab6f22e5a..ef87f107c942 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1045,6 +1045,7 @@ static void amdgpu_ras_get_ecc_info(struct > amdgpu_device *adev, struct ras_err_d static void > amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, > struct ras_manager *ras_mgr, > struct ras_err_data *err_data, > + struct ras_query_context *qctx, > const char *blk_name, > bool is_ue, > bool is_de) > @@ -1052,27 +1053,28 @@ static void > amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, > struct amdgpu_smuio_mcm_config_info *mcm_info; > struct ras_err_node *err_node; > struct ras_err_info *err_info; > + u64 event_id = qctx->event_id; > > if (is_ue) { > for_each_ras_error(err_node, err_data) { > err_info = &err_node->err_info; > mcm_info = &err_info->mcm_info; > if 
(err_info->ue_count) { > - dev_info(adev->dev, "socket: %d, die: %d, " > - "%lld new uncorrectable hardware > errors detected in %s block\n", > - mcm_info->socket_id, > - mcm_info->die_id, > - err_info->ue_count, > - blk_name); > + RAS_EVENT_LOG(adev, event_id, "socket: %d, > die: %d, " > + "%lld new uncorrectable hardware > errors detected in %s block\n", > + mcm_info->socket_id, > + mcm_info->die_id, > + err_info->ue_count, > + blk_name); > } > } > > for_each_ras_error(err_node, &ras_mgr->err_data) { > err_info = &err_node->err_info; > mcm_info = &err_info->mcm_info; > - dev_info(adev->dev, "socket: %d, die: %d, " > - "%lld uncorrectable hardware errors detected > in total in %s block\n", > - mcm_info->socket_id, mcm_info->die_id, > err_info->ue_count, blk_name); > + RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " > + "%lld uncorrectable hardware errors > detected in total in %s block\n", > + mcm_info->socket_id, mcm_info->die_id, > err_info->ue_count, > +blk_name); > } > > } else { > @@ -1081,44 +1083,44 @@ static void > amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, > err_info = &err_node->err_info; > mcm_info = &err_info->mcm_info; > if (err_info->de_count) { > - dev_info(adev->dev, "socket: %d, > die: %d, " > - "%lld new deferred hardware > errors detected in %s block\n", > - mcm_info->socket_id, > - mcm_info->die_id, > - err_info->de_count, > - blk_name); > + RAS_EVENT_LOG(adev, event_id, > "socket: %d, die: %d, " > + "%lld new deferred > hardware errors detected in %s block\n", > + mcm_info->socket_id, > + mcm_info->die_id, > + err_info->de_count, > + blk_name); > } > } > > for_each_ras_error(err_node, &ras_mgr->err_data) { > err_info = &err_node->err_info; > mcm_info = &err_info->mcm_info; > - dev_info(adev->dev, "socket: %d, die: %d, " > - "%lld deferred hardware errors > detected in total in %s block\n", > - mcm_info->socket_id, mcm_info- > >die_id, > - err_info->de_count, blk_name); > + RAS_EVENT_LOG(adev, event_id, "socket: %d, > 
die: %d, " > + "%lld deferred hardware errors > detected in total in %s block\n", > + mcm_info->socket_id, mcm_info- > >die_id, > + err_info->de_count, blk_name); > } > } else { > for_each_ras_error(err_node, err_data) { > err_info = &err_node->err_info; > mcm_info = &err_info->mcm_info; > if (err_info->ce_count) { > - dev_info(adev->dev, "socket: %d, > die: %d, " > - "%lld new correctable > hardware errors detected in %s block\n", > - mcm_info->socket_id, > - mcm_info->die_id, > - err_info->ce_count, > - blk_name); > + RAS_EVENT_LOG(adev, event_id, > "socket: %d, die: %d, " > + "%lld new correctable > hardware errors detected in %s block\n", > + mcm_info->socket_id, > + mcm_info->die_id, > + err_info->ce_count, > + blk_name); > } > } > > for_each_ras_error(err_node, &ras_mgr->err_data) { > err_info = &err_node->err_info; > mcm_info = &err_info->mcm_info; > - dev_info(adev->dev, "socket: %d, die: %d, " > - "%lld correctable hardware errors > detected in total in %s block\n", > - mcm_info->socket_id, mcm_info- > >die_id, > - err_info->ce_count, blk_name); > + RAS_EVENT_LOG(adev, event_id, "socket: %d, > die: %d, " > + "%lld correctable hardware errors > detected in total in %s block\n", > + mcm_info->socket_id, mcm_info- > >die_id, > + err_info->ce_count, blk_name); > } > } > } > @@ -1131,77 +1133,79 @@ static inline bool err_data_has_source_info(struct > ras_err_data *data) > > static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, > struct ras_query_if *query_if, > - struct ras_err_data *err_data) > + struct ras_err_data *err_data, > + struct ras_query_context *qctx) > { > struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if- > >head); > const char *blk_name = get_ras_block_str(&query_if->head); > + u64 event_id = qctx->event_id; > > if (err_data->ce_count) { > if (err_data_has_source_info(err_data)) { > - amdgpu_ras_error_print_error_data(adev, ras_mgr, > err_data, > + amdgpu_ras_error_print_error_data(adev, ras_mgr, > err_data, 
qctx, > blk_name, false, > false); > } else if (!adev->aid_mask && > adev->smuio.funcs && > adev->smuio.funcs->get_socket_id && > adev->smuio.funcs->get_die_id) { > - dev_info(adev->dev, "socket: %d, die: %d " > - "%ld correctable hardware errors " > - "detected in %s block\n", > - adev->smuio.funcs->get_socket_id(adev), > - adev->smuio.funcs->get_die_id(adev), > - ras_mgr->err_data.ce_count, > - blk_name); > + RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " > + "%ld correctable hardware errors " > + "detected in %s block\n", > + adev->smuio.funcs->get_socket_id(adev), > + adev->smuio.funcs->get_die_id(adev), > + ras_mgr->err_data.ce_count, > + blk_name); > } else { > - dev_info(adev->dev, "%ld correctable hardware errors " > - "detected in %s block\n", > - ras_mgr->err_data.ce_count, > - blk_name); > + RAS_EVENT_LOG(adev, event_id, "%ld correctable > hardware errors " > + "detected in %s block\n", > + ras_mgr->err_data.ce_count, > + blk_name); > } > } > > if (err_data->ue_count) { > if (err_data_has_source_info(err_data)) { > - amdgpu_ras_error_print_error_data(adev, ras_mgr, > err_data, > + amdgpu_ras_error_print_error_data(adev, ras_mgr, > err_data, qctx, > blk_name, true, > false); > } else if (!adev->aid_mask && > adev->smuio.funcs && > adev->smuio.funcs->get_socket_id && > adev->smuio.funcs->get_die_id) { > - dev_info(adev->dev, "socket: %d, die: %d " > - "%ld uncorrectable hardware errors " > - "detected in %s block\n", > - adev->smuio.funcs->get_socket_id(adev), > - adev->smuio.funcs->get_die_id(adev), > - ras_mgr->err_data.ue_count, > - blk_name); > + RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " > + "%ld uncorrectable hardware errors " > + "detected in %s block\n", > + adev->smuio.funcs->get_socket_id(adev), > + adev->smuio.funcs->get_die_id(adev), > + ras_mgr->err_data.ue_count, > + blk_name); > } else { > - dev_info(adev->dev, "%ld uncorrectable hardware errors > " > - "detected in %s block\n", > - ras_mgr->err_data.ue_count, > - blk_name); 
> + RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable > hardware errors " > + "detected in %s block\n", > + ras_mgr->err_data.ue_count, > + blk_name); > } > } > > if (err_data->de_count) { > if (err_data_has_source_info(err_data)) { > - amdgpu_ras_error_print_error_data(adev, ras_mgr, > err_data, > + amdgpu_ras_error_print_error_data(adev, ras_mgr, > err_data, qctx, > blk_name, false, > true); > } else if (!adev->aid_mask && > adev->smuio.funcs && > adev->smuio.funcs->get_socket_id && > adev->smuio.funcs->get_die_id) { > - dev_info(adev->dev, "socket: %d, die: %d " > - "%ld deferred hardware errors " > - "detected in %s block\n", > - adev->smuio.funcs->get_socket_id(adev), > - adev->smuio.funcs->get_die_id(adev), > - ras_mgr->err_data.de_count, > - blk_name); > + RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " > + "%ld deferred hardware errors " > + "detected in %s block\n", > + adev->smuio.funcs->get_socket_id(adev), > + adev->smuio.funcs->get_die_id(adev), > + ras_mgr->err_data.de_count, > + blk_name); > } else { > - dev_info(adev->dev, "%ld deferred hardware errors " > - "detected in %s block\n", > - ras_mgr->err_data.de_count, > - blk_name); > + RAS_EVENT_LOG(adev, event_id, "%ld deferred > hardware errors " > + "detected in %s block\n", > + ras_mgr->err_data.de_count, > + blk_name); > } > } > } > @@ -1294,6 +1298,7 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device > *dev, struct device_attribute *a static int > amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, > struct ras_query_if *info, > struct ras_err_data *err_data, > + struct ras_query_context > *qctx, > unsigned int > error_query_mode) > { > enum amdgpu_ras_block blk = info ? 
info->head.block : > AMDGPU_RAS_BLOCK_COUNT; @@ -1338,8 +1343,8 @@ static int > amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, > return ret; > } else { > /* FIXME: add code to check return value later */ > - amdgpu_mca_smu_log_ras_error(adev, blk, > AMDGPU_MCA_ERROR_TYPE_UE, err_data); > - amdgpu_mca_smu_log_ras_error(adev, blk, > AMDGPU_MCA_ERROR_TYPE_CE, err_data); > + amdgpu_mca_smu_log_ras_error(adev, blk, > AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx); > + amdgpu_mca_smu_log_ras_error(adev, blk, > AMDGPU_MCA_ERROR_TYPE_CE, > +err_data, qctx); > } > } > > @@ -1351,6 +1356,7 @@ int amdgpu_ras_query_error_status(struct > amdgpu_device *adev, struct ras_query_i { > struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); > struct ras_err_data err_data; > + struct ras_query_context qctx; > unsigned int error_query_mode; > int ret; > > @@ -1364,8 +1370,12 @@ int amdgpu_ras_query_error_status(struct > amdgpu_device *adev, struct ras_query_i > if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) > return -EINVAL; > > + memset(&qctx, 0, sizeof(qctx)); > + qctx.event_id = amdgpu_ras_acquire_event_id(adev, > amdgpu_ras_intr_triggered() ? 
> + RAS_EVENT_TYPE_ISR : > RAS_EVENT_TYPE_INVALID); > ret = amdgpu_ras_query_error_status_helper(adev, info, > &err_data, > + &qctx, > error_query_mode); > if (ret) > goto out_fini_err_data; > @@ -1376,7 +1386,7 @@ int amdgpu_ras_query_error_status(struct > amdgpu_device *adev, struct ras_query_i > info->ce_count = obj->err_data.ce_count; > info->de_count = obj->err_data.de_count; > > - amdgpu_ras_error_generate_report(adev, info, &err_data); > + amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx); > > out_fini_err_data: > amdgpu_ras_error_data_fini(&err_data); > @@ -3036,6 +3046,31 @@ static int amdgpu_get_ras_schema(struct > amdgpu_device *adev) > AMDGPU_RAS_ERROR__PARITY; > } > > +static void ras_event_mgr_init(struct ras_event_manager *mgr) { > + int i; > + > + for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++) > + atomic64_set(&mgr->seqnos[i], 0); > +} > + > +static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) { > + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); > + > + ras->event_mgr = hive ? 
&hive->event_mgr : &ras->__event_mgr; > + > + /* init event manager with node 0 on xgmi system */ > + if (!amdgpu_in_reset(adev)) { > + if (!hive || adev->gmc.xgmi.node_id == 0) > + ras_event_mgr_init(ras->event_mgr); > + } > + > + if (hive) > + amdgpu_put_xgmi_hive(hive); > +} > + > int amdgpu_ras_init(struct amdgpu_device *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3356,6 > +3391,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) > if (amdgpu_sriov_vf(adev)) > return 0; > > + amdgpu_ras_event_mgr_init(adev); > + > if (amdgpu_aca_is_enabled(adev)) { > if (amdgpu_in_reset(adev)) > r = amdgpu_aca_reset(adev); > @@ -3472,13 +3509,37 @@ void amdgpu_ras_set_fed(struct amdgpu_device > *adev, bool status) > atomic_set(&ras->fed, !!status); > } > > +bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id) { > + return !(id & BIT_ULL(63)); > +} > + > +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum > +ras_event_type type) { > + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > + u64 id; > + > + switch (type) { > + case RAS_EVENT_TYPE_ISR: > + id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]); > + break; > + case RAS_EVENT_TYPE_INVALID: > + default: > + id = BIT_ULL(63) | 0ULL; > + break; > + } > + > + return id; > +} > + > void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { > if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > + u64 event_id = > +(u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]); > > - dev_info(adev->dev, "uncorrectable hardware error" > - "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); > + RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware > error" > + "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); > > ras->gpu_reset_flags |= > AMDGPU_RAS_GPU_RESET_MODE1_RESET; > amdgpu_ras_reset_gpu(adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 
e0f8ce9d8440..64788ae7d85d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -64,6 +64,14 @@ struct amdgpu_iv_entry; > /* The high three bits indicates socketid */ #define > AMDGPU_RAS_GET_FEATURES(val) ((val) & > ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) > > +#define RAS_EVENT_LOG(adev, id, fmt, ...) \ > +do { \ > + if (amdgpu_ras_event_id_is_valid((adev), (id))) \ > + dev_info((adev)->dev, "{%llu}" fmt, (id), ##__VA_ARGS__); \ > + else \ > + dev_info((adev)->dev, fmt, ##__VA_ARGS__); \ > +} while (0) > + > enum amdgpu_ras_block { > AMDGPU_RAS_BLOCK__UMC = 0, > AMDGPU_RAS_BLOCK__SDMA, > @@ -419,6 +427,21 @@ struct umc_ecc_info { > int record_ce_addr_supported; > }; > > +enum ras_event_type { > + RAS_EVENT_TYPE_INVALID = -1, > + RAS_EVENT_TYPE_ISR = 0, > + RAS_EVENT_TYPE_COUNT, > +}; > + > +struct ras_event_manager { > + atomic64_t seqnos[RAS_EVENT_TYPE_COUNT]; }; > + > +struct ras_query_context { > + enum ras_event_type type; > + u64 event_id; > +}; > + > struct amdgpu_ras { > /* ras infrastructure */ > /* for ras itself. 
*/ > @@ -479,6 +502,11 @@ struct amdgpu_ras { > atomic_t page_retirement_req_cnt; > /* Fatal error detected flag */ > atomic_t fed; > + > + /* RAS event manager */ > + struct ras_event_manager __event_mgr; > + struct ras_event_manager *event_mgr; > + > }; > > struct ras_fs_data { > @@ -879,4 +907,6 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info > *err_info, void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); > bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); > > +bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id); > +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum > +ras_event_type type); > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > index 1592c63b3099..a3bfc16de6d4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > @@ -44,6 +44,7 @@ struct amdgpu_hive_info { > > struct amdgpu_reset_domain *reset_domain; > atomic_t ras_recovery; > + struct ras_event_manager event_mgr; > }; > > struct amdgpu_pcs_ras_field { > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > index 77af4e25ff46..4a02e1f041da 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > @@ -404,10 +404,16 @@ static int umc_v12_0_err_cnt_init_per_channel(struct > amdgpu_device *adev, static void > umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev, > void *ras_error_status) > { > + struct ras_query_context qctx; > + > + memset(&qctx, 0, sizeof(qctx)); > + qctx.event_id = amdgpu_ras_acquire_event_id(adev, > amdgpu_ras_intr_triggered() ? 
> + RAS_EVENT_TYPE_ISR : > RAS_EVENT_TYPE_INVALID); > + > amdgpu_mca_smu_log_ras_error(adev, > - AMDGPU_RAS_BLOCK__UMC, > AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status); > + AMDGPU_RAS_BLOCK__UMC, > AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, > +&qctx); > amdgpu_mca_smu_log_ras_error(adev, > - AMDGPU_RAS_BLOCK__UMC, > AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status); > + AMDGPU_RAS_BLOCK__UMC, > AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, > +&qctx); > } > > static void umc_v12_0_ecc_info_query_ras_error_address(struct > amdgpu_device *adev, > -- > 2.34.1