[AMD Official Use Only] Copy Felix @@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev, break; case SOC15_INTSRC_SDMA_ECC: default: + if (client_id == SOC15_IH_CLIENTID_UTCL2) + ret = kfd_dqm_evict_pasid(dev->dqm, pasid); break; } This will break SDMA - we haven't enabled optimized poison consumption handling for sdma yet. I'd suggest we explicitly call out the interrupt source id UTCL2_FAULT as a case, even if it is the same as VM_FAULT. And it should be fine to start evict_queue directly after that, because the ISR already guarantees this is from the UTCL2 client, right? + if (client_id == SOC15_IH_CLIENTID_UTCL2 && + dev->kfd2kgd->is_ras_utcl2_poison && + dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) { + event_interrupt_poison_consumption(dev, ih_ring_entry); In addition, is_ras_utcl2_poison can be renamed to query_utcl2_ras_status or poison_status, and utcl2_fault_clear to reset_utcl2_poison_status, to align with the naming style of the ras hw ops. Thinking about this more, it would be better to add this to the gfx ras op and expose it to KFD. Thoughts? Regards, Hawking -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Monday, March 14, 2022 15:03 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Subject: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2 Do RAS page retirement and use gpu reset as fallback in utcl2 fault handler. 
Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index f7def0bf0730..3991f71d865b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -93,11 +93,12 @@ enum SQ_INTERRUPT_ERROR_TYPE { static void event_interrupt_poison_consumption(struct kfd_dev *dev, const uint32_t *ih_ring_entry) { - uint16_t source_id, pasid; + uint16_t source_id, client_id, pasid; int ret = -EINVAL; struct kfd_process *p; source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); p = kfd_lookup_process_by_pasid(pasid); @@ -110,6 +111,7 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev, return; } + pr_debug("RAS poison consumption handling\n"); atomic_set(&p->poison, 1); kfd_unref_process(p); @@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev, break; case SOC15_INTSRC_SDMA_ECC: default: + if (client_id == SOC15_IH_CLIENTID_UTCL2) + ret = kfd_dqm_evict_pasid(dev->dqm, pasid); break; } - kfd_signal_poison_consumed_event(dev, pasid); + /* utcl2 page fault has its own vm fault event */ + if (client_id != SOC15_IH_CLIENTID_UTCL2) + kfd_signal_poison_consumed_event(dev, pasid); /* resetting queue passes, do page retirement without gpu reset * resetting queue fails, fallback to gpu reset solution @@ -314,7 +320,18 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, info.prot_write = ring_id & 0x20; kfd_smi_event_update_vmfault(dev, pasid); - kfd_dqm_evict_pasid(dev->dqm, pasid); + + if (client_id == SOC15_IH_CLIENTID_UTCL2 && + dev->kfd2kgd->is_ras_utcl2_poison && + dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) { + 
event_interrupt_poison_consumption(dev, ih_ring_entry); + + if (dev->kfd2kgd->utcl2_fault_clear) + dev->kfd2kgd->utcl2_fault_clear(dev->adev); + } + else + kfd_dqm_evict_pasid(dev->dqm, pasid); + kfd_signal_vm_fault_event(dev, pasid, &info); } } -- 2.35.1