add amdgpu ras POSION_CONSUMPTION event id support. Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 15 ++++++++++++--- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8a98611d2353..11f8c37a97ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2076,10 +2076,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, obj->head.block, 0); struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; + u64 event_id; + int ret; if (!block_obj || !con) return; + ret = amdgpu_ras_mark_ras_event(adev, type); + if (ret) + return; + /* both query_poison_status and handle_poison_consumption are optional, * but at least one of them should be implemented if we need poison * consumption handler @@ -2104,8 +2111,10 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. */ if (poison_stat && !con->is_rma) { - dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", - block_obj->ras_comm.name); + event_id = amdgpu_ras_acquire_event_id(adev, type); + RAS_EVENT_LOG(adev, event_id, + "GPU reset for %s RAS poison consumption is issued!\n", + block_obj->ras_comm.name); amdgpu_ras_reset_gpu(adev); } @@ -2498,7 +2507,7 @@ static enum ras_event_type amdgpu_ras_get_recovery_event(struct amdgpu_device *a if (amdgpu_ras_intr_triggered()) return RAS_EVENT_TYPE_ISR; else - return RAS_EVENT_TYPE_INVALID; + return RAS_EVENT_TYPE_POISON_CONSUMPTION; } static void amdgpu_ras_do_recovery(struct work_struct *work) @@ -3975,6 +3984,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type switch (type) { case RAS_EVENT_TYPE_ISR: case RAS_EVENT_TYPE_POISON_CREATION: + case RAS_EVENT_TYPE_POISON_CONSUMPTION: event_mgr = __get_ras_event_mgr(adev); if (!event_mgr) return RAS_EVENT_INVALID_ID; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 1343cfbc913b..6086da67fa4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -433,6 +433,7 @@ enum ras_event_type { RAS_EVENT_TYPE_INVALID = 0, RAS_EVENT_TYPE_ISR, RAS_EVENT_TYPE_POISON_CREATION, + RAS_EVENT_TYPE_POISON_CONSUMPTION, RAS_EVENT_TYPE_COUNT, }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 816800555f7f..8a10a0e42846 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -27,6 +27,7 @@ #include "soc15_int.h" #include "kfd_device_queue_manager.h" #include "kfd_smi_events.h" +#include "amdgpu_ras.h" /* * GFX9 SQ Interrupts @@ -144,9 +145,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, uint16_t pasid, uint16_t client_id) { enum amdgpu_ras_block block = 0; - int old_poison; uint32_t reset = 0; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; + u64 event_id; + int old_poison, ret; if (!p) return; @@ -191,10 +194,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, return; } + ret = amdgpu_ras_mark_ras_event(dev->adev, type); + if (ret) + return; + kfd_signal_poison_consumed_event(dev, pasid); - dev_warn(dev->adev->dev, - "poison is consumed by client %d, kick off gpu reset flow\n", client_id); + event_id = amdgpu_ras_acquire_event_id(dev->adev, type); + + RAS_EVENT_LOG(dev->adev, event_id, + "poison is consumed by client %d, kick off gpu reset flow\n", client_id); amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev, block, pasid, NULL, NULL, reset); -- 2.34.1