Re: [PATCH 3/3] drm/amdkfd: add RAS poison consumption support for utcl2

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 3/14/2022 12:33 PM, Tao Zhou wrote:
Do RAS page retirement and use gpu reset as fallback in utcl2
fault handler.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
  .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 23 ++++++++++++++++---
  1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index f7def0bf0730..3991f71d865b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,11 +93,12 @@ enum SQ_INTERRUPT_ERROR_TYPE {
  static void event_interrupt_poison_consumption(struct kfd_dev *dev,
  				const uint32_t *ih_ring_entry)
  {
-	uint16_t source_id, pasid;
+	uint16_t source_id, client_id, pasid;
  	int ret = -EINVAL;
  	struct kfd_process *p;
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
  	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
p = kfd_lookup_process_by_pasid(pasid);
@@ -110,6 +111,7 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
  		return;
  	}
+ pr_debug("RAS poison consumption handling\n");

dev is available through kfd_dev.

  	atomic_set(&p->poison, 1);
  	kfd_unref_process(p);
@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
  		break;
  	case SOC15_INTSRC_SDMA_ECC:
  	default:
+		if (client_id == SOC15_IH_CLIENTID_UTCL2)
+			ret = kfd_dqm_evict_pasid(dev->dqm, pasid);

Since this doesn't logically belong to the switch condition, it would be better to keep it outside of the switch.

  		break;
  	}
- kfd_signal_poison_consumed_event(dev, pasid);
+	/* utcl2 page fault has its own vm fault event */
+	if (client_id != SOC15_IH_CLIENTID_UTCL2)
+		kfd_signal_poison_consumed_event(dev, pasid);
/* resetting queue passes, do page retirement without gpu reset
  	 * resetting queue fails, fallback to gpu reset solution
@@ -314,7 +320,18 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
  		info.prot_write = ring_id & 0x20;
kfd_smi_event_update_vmfault(dev, pasid);
-		kfd_dqm_evict_pasid(dev->dqm, pasid);
+
+		if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+		    dev->kfd2kgd->is_ras_utcl2_poison &&
+		    dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+			event_interrupt_poison_consumption(dev, ih_ring_entry);
+
Is it expected that no other interrupt would come until this FED error is cleared? Otherwise subsequent ones could also be treated as poison.

Basically, should the handling be done in the following order or not?
	1) Clear FED
	2) Handle poison consumption


Thanks,
Lijo

+			if (dev->kfd2kgd->utcl2_fault_clear)
+				dev->kfd2kgd->utcl2_fault_clear(dev->adev);
+		}
+		else
+			kfd_dqm_evict_pasid(dev->dqm, pasid);
+
  		kfd_signal_vm_fault_event(dev, pasid, &info);
  	}
  }




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux