Re: [PATCH 3/3] drm/amdkfd: reset queue which consumes RAS poison

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2021-12-10 6:15 a.m., Tao Zhou wrote:
CP supports unmapping queues with a reset mode that destroys only the specified queue without affecting others.
Replacing a whole GPU reset with this queue reset mode for RAS poison consumption
saves a lot of time, and we can still fall back to the GPU reset solution if the queue
reset fails.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  6 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  3 ++-
  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 14 ++++++++++
  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
  .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 27 ++++++++++++++++---
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 ++
  6 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 46cf48b3904a..0bf09a94d944 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
  	return adev->have_atomics_support;
  }
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
  {
  	struct ras_err_data err_data = {0, 0, 0, NULL};
/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
  	if (!adev->gmc.xgmi.connected_to_cpu)
-		amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
-	else
+		amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+	else if (reset)
  		amdgpu_amdkfd_gpu_reset(adev);
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..61f899e54fd5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
  				      uint64_t *mmap_offset);
  int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
  				struct tile_config *config);
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+				bool reset);
  #if IS_ENABLED(CONFIG_HSA_AMD)
  void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
  void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 01a2cc3928ac..095b2e0822aa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1476,6 +1476,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
  	return retval;
  }
+int unmap_queues_cpsch_poison(struct device_queue_manager *dqm, uint32_t pasid)
+{
+	int ret;
+
+	dqm_lock(dqm);
+
+	ret = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID,
+			pasid, true);
+
+	dqm_unlock(dqm);
+
+	return ret;
+}
+
  /* dqm->lock mutex has to be locked before calling this function */
  static int execute_queues_cpsch(struct device_queue_manager *dqm,
  				enum kfd_unmap_queues_filter filter,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 499fc0ea387f..c52869133159 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -219,6 +219,7 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
  unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
  unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
  unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
+int unmap_queues_cpsch_poison(struct device_queue_manager *dqm, uint32_t pasid);
static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
  {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index deb64168c9e8..2863bb9e5bca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -89,6 +89,27 @@ enum SQ_INTERRUPT_ERROR_TYPE {
  #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
  #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+				uint16_t pasid)
+{
+	int ret;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+	/* all queues of a process will be unmapped in one time */
+	if (p && atomic_read(&p->poison))
+		return;
+
+	atomic_set(&p->poison, 1);

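You're not checking p != NULL here. The check above only returns early when p is non-NULL and already poisoned, so a failed lookup still reaches atomic_set(&p->poison, 1) with p == NULL.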

You also need to release the process refcount before this function returns. Otherwise the process resources will be leaked. You can see leaked processes in /sys/class/kfd/kfd/proc. That directory should be empty after all KFD processes have terminated.

Other than that, the series is

Acked-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>


+	ret = unmap_queues_cpsch_poison(dev->dqm, pasid);
+	kfd_signal_poison_consumed_event(dev, pasid);
+	/* resetting queue passes, do page retirement without gpu reset
+	   resetting queue fails, fallback to gpu reset solution */
+	if (!ret)
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+	else
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+}
+
  static bool event_interrupt_isr_v9(struct kfd_dev *dev,
  					const uint32_t *ih_ring_entry,
  					uint32_t *patched_ihre,
@@ -230,8 +251,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
  					sq_intr_err);
  				if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
  					sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-					kfd_signal_poison_consumed_event(dev, pasid);
-					amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+					event_interrupt_poison_consumption(dev, pasid);
  					return;
  				}
  				break;
@@ -252,8 +272,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
  		if (source_id == SOC15_INTSRC_SDMA_TRAP) {
  			kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
  		} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-			kfd_signal_poison_consumed_event(dev, pasid);
-			amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+			event_interrupt_poison_consumption(dev, pasid);
  			return;
  		}
  	} else if (client_id == SOC15_IH_CLIENTID_VMC ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0c3f911e3bf4..ea68f3b3a4e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -856,6 +856,8 @@ struct kfd_process {
  	struct svm_range_list svms;
bool xnack_enabled;
+
+	atomic_t poison;
  };
#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux