[AMD Official Use Only - AMD Internal Distribution Only] -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Wednesday, July 3, 2024 4:28 PM To: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> Subject: RE: [PATCH 3/4] drm/amdgpu: add ras POSION_CONSUMPTION event id support [AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx> > Sent: Wednesday, July 3, 2024 1:52 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao > <Tao.Zhou1@xxxxxxx> > Subject: [PATCH 3/4] drm/amdgpu: add ras POSION_CONSUMPTION event id > support > > add amdgpu ras POSION_CONSUMPTION event id support. > > Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +++++++++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + > drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 15 ++++++++++++--- > 3 files changed, 26 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 8a98611d2353..11f8c37a97ef 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -2076,10 +2076,17 @@ static void > amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * > struct amdgpu_ras_block_object *block_obj = > amdgpu_ras_get_ras_block(adev, obj->head.block, 0); > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + enum ras_event_type type = > RAS_EVENT_TYPE_POISON_CONSUMPTION; > + u64 event_id; > + int ret; > > if (!block_obj || !con) > return; > > + ret = amdgpu_ras_mark_ras_event(adev, type); > + if (ret) [Tao] add warning? Or you can add it in amdgpu_ras_mark_ras_event. [Kevin]: Thanks suggestion, I will update v2 and add a warn log in function internal. Best Regards, Kevin > + return; > + > /* both query_poison_status and handle_poison_consumption are > optional, > * but at least one of them should be implemented if we need poison > * consumption handler > @@ -2104,8 +2111,10 @@ static void > amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * > * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. > */ > if (poison_stat && !con->is_rma) { > - dev_info(adev->dev, "GPU reset for %s RAS poison consumption > is issued!\n", > - block_obj->ras_comm.name); > + event_id = amdgpu_ras_acquire_event_id(adev, type); > + RAS_EVENT_LOG(adev, event_id, > + "GPU reset for %s RAS poison consumption > + is > issued!\n", > + block_obj->ras_comm.name); > amdgpu_ras_reset_gpu(adev); > } > > @@ -2498,7 +2507,7 @@ static enum ras_event_type > amdgpu_ras_get_recovery_event(struct amdgpu_device *a > if (amdgpu_ras_intr_triggered()) > return RAS_EVENT_TYPE_ISR; > else > - return RAS_EVENT_TYPE_INVALID; > + return RAS_EVENT_TYPE_POISON_CONSUMPTION; > } > > static void amdgpu_ras_do_recovery(struct work_struct *work) @@ > -3975,6 > +3984,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, > enum ras_event_type > switch (type) { > case RAS_EVENT_TYPE_ISR: > case RAS_EVENT_TYPE_POISON_CREATION: > + case RAS_EVENT_TYPE_POISON_CONSUMPTION: > event_mgr = __get_ras_event_mgr(adev); > if (!event_mgr) > return RAS_EVENT_INVALID_ID; diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 1343cfbc913b..6086da67fa4e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -433,6 +433,7 @@ enum ras_event_type { > RAS_EVENT_TYPE_INVALID = 0, > RAS_EVENT_TYPE_ISR, > RAS_EVENT_TYPE_POISON_CREATION, > + RAS_EVENT_TYPE_POISON_CONSUMPTION, > RAS_EVENT_TYPE_COUNT, > }; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > index 816800555f7f..8a10a0e42846 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > @@ -27,6 +27,7 @@ > #include "soc15_int.h" > #include "kfd_device_queue_manager.h" > #include "kfd_smi_events.h" > +#include "amdgpu_ras.h" > > /* > * GFX9 SQ Interrupts > @@ -144,9 +145,11 @@ static void > event_interrupt_poison_consumption_v9(struct kfd_node *dev, > uint16_t pasid, uint16_t client_id) { > enum amdgpu_ras_block block = 0; > - int old_poison; > uint32_t reset = 0; > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); > + enum ras_event_type type = > RAS_EVENT_TYPE_POISON_CONSUMPTION; > + u64 event_id; > + int old_poison, ret; > > if (!p) > return; > @@ -191,10 +194,16 @@ static void > event_interrupt_poison_consumption_v9(struct kfd_node *dev, > return; > } > > + ret = amdgpu_ras_mark_ras_event(dev->adev, type); [Tao] I don't think it's necessary to mark it both in amdgpu_ras_interrupt_poison_consumption_handler and here. Another question is, event_interrupt_poison_consumption_v9 is used by several ASICs, are you sure the event id code doesn't affect old ASICs? [Kevin]: The event id is only used for format the RAS event log, and it won't affect the code logic for old Asics. Best Regards, Kevin > + if (ret) > + return; > + > kfd_signal_poison_consumed_event(dev, pasid); > > - dev_warn(dev->adev->dev, > - "poison is consumed by client %d, kick off gpu reset flow\n", > client_id); > + event_id = amdgpu_ras_acquire_event_id(dev->adev, type); > + > + RAS_EVENT_LOG(dev->adev, event_id, > + "poison is consumed by client %d, kick off gpu > +reset flow\n", client_id); > > amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev, > block, pasid, NULL, NULL, reset); > -- > 2.34.1