[AMD Official Use Only - AMD Internal Distribution Only] We will need to change RAS_EVENT_TYPE_ISR to RAS_EVENT_TYPE_FATAL to differentiate it from the upcoming correctable error ISR event. Please also align the terminology in the code by renaming recovery_event to fatal_error_event, i.e., amdgpu_ras_get_recovery_event -> amdgpu_ras_get_fatal_error_event. Regards, Hawking -----Original Message----- From: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> Sent: Wednesday, July 3, 2024 13:52 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx> Subject: [PATCH 1/4] drm/amdgpu: refine amdgpu ras event id core code - use unified event id to manage ras events - add a new function amdgpu_ras_query_error_status_with_event() to accept event type as parameter. Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 89 ++++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 18 +++-- drivers/gpu/drm/amd/amdgpu_ras.c | 0 5 files changed, 88 insertions(+), 25 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdgpu_ras.c diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 7945173321a2..19158cc30f31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -119,7 +119,7 @@ static struct aca_regs_dump { static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank, struct ras_query_context *qctx) { - u64 event_id = qctx ? qctx->event_id : 0ULL; + u64 event_id = qctx ? 
qctx->evid.event_id : RAS_EVENT_INVALID_ID; int i; RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 9d3a3c778504..2542bd7aa7c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -274,7 +274,7 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry, struct ras_query_context *qctx) { - u64 event_id = qctx->event_id; + u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID; RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n", @@ -543,7 +543,7 @@ static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type) amdgpu_mca_bank_set_init(&mca_set); - qctx.event_id = 0ULL; + qctx.evid.event_id = RAS_EVENT_INVALID_ID; ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx); if (ret) goto err_free_mca_set; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 53b5ac2d7bed..45ac82a34d49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1055,7 +1055,7 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info; struct ras_err_node *err_node; struct ras_err_info *err_info; - u64 event_id = qctx->event_id; + u64 event_id = qctx->evid.event_id; if (is_ue) { for_each_ras_error(err_node, err_data) { @@ -1140,7 +1140,7 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, { struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); const char *blk_name = get_ras_block_str(&query_if->head); - u64 event_id = qctx->event_id; + u64 event_id 
= qctx->evid.event_id; if (err_data->ce_count) { if (err_data_has_source_info(err_data)) { @@ -1366,7 +1366,9 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, } /* query/inject/cure begin */ -int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) +static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev, + struct ras_query_if *info, + enum ras_event_type type) { struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data; @@ -1385,8 +1387,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i return -EINVAL; memset(&qctx, 0, sizeof(qctx)); - qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ? - RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID); + qctx.evid.type = type; + qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type); if (!down_read_trylock(&adev->reset_domain->sem)) { ret = -EIO; @@ -1415,6 +1417,11 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i return ret; } +int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct +ras_query_if *info) { + return amdgpu_ras_query_error_status_with_event(adev, info, +RAS_EVENT_TYPE_INVALID); } + int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { @@ -2305,7 +2312,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) /* ih end */ /* traversal all IPs except NBIO to query error counter */ -static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) +static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, +enum ras_event_type type) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; @@ -2338,7 +2345,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) IP_VERSION(13, 0, 2))) continue; - amdgpu_ras_query_error_status(adev, &info); + 
amdgpu_ras_query_error_status_with_event(adev, &info, type); if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && @@ -2477,6 +2484,14 @@ bool amdgpu_ras_in_recovery(struct amdgpu_device *adev) return false; } +static enum ras_event_type amdgpu_ras_get_recovery_event(struct +amdgpu_device *adev) { + if (amdgpu_ras_intr_triggered()) + return RAS_EVENT_TYPE_ISR; + else + return RAS_EVENT_TYPE_INVALID; +} + static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = @@ -2485,6 +2500,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + enum ras_event_type type; if (hive) { atomic_set(&hive->ras_recovery, 1); @@ -2512,10 +2528,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) device_list_handle = &device_list; } + type = amdgpu_ras_get_recovery_event(adev); list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { amdgpu_ras_query_err_status(remote_adev); - amdgpu_ras_log_on_err_counter(remote_adev); + amdgpu_ras_log_on_err_counter(remote_adev, type); } } @@ -3406,8 +3423,11 @@ static void ras_event_mgr_init(struct ras_event_manager *mgr) { int i; - for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++) - atomic64_set(&mgr->seqnos[i], 0); + memset(mgr, 0, sizeof(*mgr)); + atomic64_set(&mgr->seqno, 0); + + for (i = 0; i < ARRAY_SIZE(mgr->last_seqno); i++) + mgr->last_seqno[i] = RAS_EVENT_INVALID_ID; } static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) @@ -3907,23 +3927,52 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) atomic_set(&ras->fed, !!status); } -bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id) +static struct ras_event_manager* __get_ras_event_mgr(struct +amdgpu_device *adev) { + struct amdgpu_ras *ras; + + ras = amdgpu_ras_get_context(adev); + if (!ras) + return NULL; + + 
return ras->event_mgr; +} + +int amdgpu_ras_mark_ras_event(struct amdgpu_device *adev, enum +ras_event_type type) { - return !(id & BIT_ULL(63)); + struct ras_event_manager *event_mgr; + + if (type >= RAS_EVENT_TYPE_COUNT) + return -EINVAL; + + event_mgr = __get_ras_event_mgr(adev); + if (!event_mgr) + return -EINVAL; + + event_mgr->last_seqno[type] = atomic64_inc_return(&event_mgr->seqno); + + return 0; } u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type) { - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + struct ras_event_manager *event_mgr; u64 id; + if (type >= RAS_EVENT_TYPE_COUNT) + return RAS_EVENT_INVALID_ID; + switch (type) { case RAS_EVENT_TYPE_ISR: - id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]); + event_mgr = __get_ras_event_mgr(adev); + if (!event_mgr) + return RAS_EVENT_INVALID_ID; + + id = event_mgr->last_seqno[type]; break; case RAS_EVENT_TYPE_INVALID: default: - id = BIT_ULL(63) | 0ULL; + id = RAS_EVENT_INVALID_ID; break; } @@ -3934,7 +3983,13 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]); + enum ras_event_type type = RAS_EVENT_TYPE_ISR; + u64 event_id; + + if (amdgpu_ras_mark_ras_event(adev, type)) + return; + + event_id = amdgpu_ras_acquire_event_id(adev, type); RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error" "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); @@ -4668,7 +4723,7 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, vaf.fmt = fmt; vaf.va = &args; - if (amdgpu_ras_event_id_is_valid(adev, event_id)) + if (RAS_EVENT_ID_IS_VALID(event_id)) dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); else dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
index 0fa1148e6642..88df4be5d122 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -68,6 +68,9 @@ struct amdgpu_iv_entry; /* The high three bits indicates socketid */ #define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) +#define RAS_EVENT_INVALID_ID (BIT_ULL(63)) +#define RAS_EVENT_ID_IS_VALID(x) (!((x) & BIT_ULL(63))) + #define RAS_EVENT_LOG(adev, id, fmt, ...) \ amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__); @@ -427,20 +430,25 @@ struct umc_ecc_info { }; enum ras_event_type { - RAS_EVENT_TYPE_INVALID = -1, - RAS_EVENT_TYPE_ISR = 0, + RAS_EVENT_TYPE_INVALID = 0, + RAS_EVENT_TYPE_ISR, RAS_EVENT_TYPE_COUNT, }; struct ras_event_manager { - atomic64_t seqnos[RAS_EVENT_TYPE_COUNT]; + atomic64_t seqno; + u64 last_seqno[RAS_EVENT_TYPE_COUNT]; }; -struct ras_query_context { +struct ras_event_id { enum ras_event_type type; u64 event_id; }; +struct ras_query_context { + struct ras_event_id evid; +}; + typedef int (*pasid_notify)(struct amdgpu_device *adev, uint16_t pasid, void *data); @@ -947,8 +955,8 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); -bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id); u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type); +int amdgpu_ras_mark_ras_event(struct amdgpu_device *adev, enum +ras_event_type type); int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); diff --git a/drivers/gpu/drm/amd/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu_ras.c new file mode 100644 index 000000000000..e69de29bb2d1 -- 2.34.1