Re: [PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset callback

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[+Andrey]

Hi Shaoyun, Monk, and Andrey,

I tested this on my bare-metal Vega10 system. GPU reset (using BACO) is flaky on this system with and without this patch. The first reset seems to work OK, the second one fails in different ways. In theory this change should be an improvement as it eliminated at least one failure mode (hanging before the actual reset). A least I don't think I'm making anything worse.

My test is to run KFDTest and trigger a GPU reset while it's busy with the eviction test. This test runs multiple KFD processes and also performs graphics memory allocations and command submissions to force TTM to evict KFD processes. I trigger the reset by hanging the HWS through a debugfs hook:

    # cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id > /sys/kernel/debug/kfd/hang_hws

Please give this patch a try and reinstate the pre_reset call before the FLR for SRIOV.

Regards,
  Felix

On 2019-12-19 9:09 p.m., Felix Kuehling wrote:
The intention of the pre_reset callback is to update the driver
state to reflect that all user mode queues are preempted and the
HIQ is destroyed. However we should not actually preempt any queues
or otherwise touch the hardware because it is presumably hanging.
The impending reset will take care of actually stopping all queues.

This should prevent KIQ register access hanging on SRIOV function
level reset (FLR). It should also speed up the reset by avoiding
unnecessary timeouts from a potentially hanging GPU scheduler.

CC: shaoyunl <shaoyun.liu@xxxxxxx>
CC: Liu Monk <Monk.Liu@xxxxxxx>
Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
---
  drivers/gpu/drm/amd/amdkfd/kfd_device.c       | 24 ++++++++++-------
  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++-------
  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  5 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  8 +++---
  .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |  4 +--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  8 +++---
  drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 11 ++++----
  .../amd/amdkfd/kfd_process_queue_manager.c    |  2 +-
  8 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c6b6901bbda3..796996a0d926 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -486,6 +486,7 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
  				unsigned int chunk_size);
  static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
+static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset);
  static int kfd_resume(struct kfd_dev *kfd);
struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
@@ -728,7 +729,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
  {
  	if (!kfd->init_complete)
  		return 0;
-	kgd2kfd_suspend(kfd);
+	kfd_suspend(kfd, true);
kfd_signal_reset_event(kfd);
  	return 0;
@@ -767,13 +768,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
  	if (!kfd->init_complete)
  		return;
- /* For first KFD device suspend all the KFD processes */
-	if (atomic_inc_return(&kfd_locked) == 1)
-		kfd_suspend_all_processes();
-
-	kfd->dqm->ops.stop(kfd->dqm);
-
-	kfd_iommu_suspend(kfd);
+	kfd_suspend(kfd, false);
  }
int kgd2kfd_resume(struct kfd_dev *kfd)
@@ -795,6 +790,17 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
  	return ret;
  }
+static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset)
+{
+	/* For first KFD device suspend all the KFD processes */
+	if (atomic_inc_return(&kfd_locked) == 1)
+		kfd_suspend_all_processes(pre_reset);
+
+	kfd->dqm->ops.stop(kfd->dqm, pre_reset);
+
+	kfd_iommu_suspend(kfd);
+}
+
  static int kfd_resume(struct kfd_dev *kfd)
  {
  	int err = 0;
@@ -877,7 +883,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
  	if (!p)
  		return -ESRCH;
- r = kfd_process_evict_queues(p);
+	r = kfd_process_evict_queues(p, false);
kfd_unref_process(p);
  	return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f7f6df40875e..3a49944164da 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -592,7 +592,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
  }
static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
-					struct qcm_process_device *qpd)
+					struct qcm_process_device *qpd,
+					bool pre_reset)
  {
  	struct queue *q;
  	struct mqd_manager *mqd_mgr;
@@ -622,6 +623,8 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
  			continue;
+		if (pre_reset)
+			continue;
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
  				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
@@ -639,7 +642,8 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
  }
static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
-				      struct qcm_process_device *qpd)
+				      struct qcm_process_device *qpd,
+				      bool pre_reset)
  {
  	struct queue *q;
  	struct kfd_process_device *pdd;
@@ -664,6 +668,10 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
  		q->properties.is_active = false;
  		dqm->queue_count--;
  	}
+
+	if (pre_reset)
+		goto out;
+
  	retval = execute_queues_cpsch(dqm,
  				qpd->is_debug ?
  				KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
@@ -944,10 +952,10 @@ static int start_nocpsch(struct device_queue_manager *dqm)
  	return 0;
  }
-static int stop_nocpsch(struct device_queue_manager *dqm)
+static int stop_nocpsch(struct device_queue_manager *dqm, bool pre_reset)
  {
  	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
-		pm_uninit(&dqm->packets);
+		pm_uninit(&dqm->packets, pre_reset);
  	dqm->sched_running = false;
return 0;
@@ -1107,20 +1115,21 @@ static int start_cpsch(struct device_queue_manager *dqm)
  	return 0;
  fail_allocate_vidmem:
  fail_set_sched_resources:
-	pm_uninit(&dqm->packets);
+	pm_uninit(&dqm->packets, false);
  fail_packet_manager_init:
  	return retval;
  }
-static int stop_cpsch(struct device_queue_manager *dqm)
+static int stop_cpsch(struct device_queue_manager *dqm, bool pre_reset)
  {
  	dqm_lock(dqm);
-	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	if (!pre_reset)
+		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
  	dqm->sched_running = false;
  	dqm_unlock(dqm);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
-	pm_uninit(&dqm->packets);
+	pm_uninit(&dqm->packets, pre_reset);
return 0;
  }
@@ -1891,7 +1900,7 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
  		return -EINVAL;
  	pdd = kfd_get_process_device_data(dqm->dev, p);
  	if (pdd)
-		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd, false);
  	kfd_unref_process(p);
return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a8c37e6da027..9f82f95f6a92 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -103,7 +103,7 @@ struct device_queue_manager_ops {
int (*initialize)(struct device_queue_manager *dqm);
  	int	(*start)(struct device_queue_manager *dqm);
-	int	(*stop)(struct device_queue_manager *dqm);
+	int	(*stop)(struct device_queue_manager *dqm, bool pre_reset);
  	void	(*uninitialize)(struct device_queue_manager *dqm);
  	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
  					struct kernel_queue *kq,
@@ -129,7 +129,8 @@ struct device_queue_manager_ops {
  			struct qcm_process_device *qpd);
int (*evict_process_queues)(struct device_queue_manager *dqm,
-				    struct qcm_process_device *qpd);
+				    struct qcm_process_device *qpd,
+				    bool pre_reset);
  	int (*restore_process_queues)(struct device_queue_manager *dqm,
  				      struct qcm_process_device *qpd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2d56dc534459..dbd2e8e9ae69 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
  }
/* Uninitialize a kernel queue and free all its memory usages. */
-static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool pre_reset)
  {
-	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !pre_reset)
  		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
  					kq->queue->mqd,
  					KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
  	return NULL;
  }
-void kernel_queue_uninit(struct kernel_queue *kq)
+void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset)
  {
-	kq_uninitialize(kq);
+	kq_uninitialize(kq, pre_reset);
  	kfree(kq);
  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 6cabed06ef5d..7732a3bbebd6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
  	return 0;
  }
-void pm_uninit(struct packet_manager *pm)
+void pm_uninit(struct packet_manager *pm, bool pre_reset)
  {
  	mutex_destroy(&pm->lock);
-	kernel_queue_uninit(pm->priv_queue);
+	kernel_queue_uninit(pm->priv_queue, pre_reset);
  }
int pm_send_set_resources(struct packet_manager *pm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 087e96838997..9bc4ced4acba 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -762,9 +762,9 @@ struct kfd_process *kfd_get_process(const struct task_struct *);
  struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
  struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
  void kfd_unref_process(struct kfd_process *p);
-int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset);
  int kfd_process_restore_queues(struct kfd_process *p);
-void kfd_suspend_all_processes(void);
+void kfd_suspend_all_processes(bool pre_reset);
  int kfd_resume_all_processes(void);
int kfd_process_device_init_vm(struct kfd_process_device *pdd,
@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
  void device_queue_manager_uninit(struct device_queue_manager *dqm);
  struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
  					enum kfd_queue_type type);
-void kernel_queue_uninit(struct kernel_queue *kq);
+void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset);
  int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
/* Process Queue Manager */
@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
  extern const struct packet_manager_funcs kfd_v9_pm_funcs;
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
-void pm_uninit(struct packet_manager *pm);
+void pm_uninit(struct packet_manager *pm, bool pre_reset);
  int pm_send_set_resources(struct packet_manager *pm,
  				struct scheduling_resources *res);
  int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 536a153ac9a4..7bcadd3a1e3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -948,7 +948,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
   * Eviction is reference-counted per process-device. This means multiple
   * evictions from different sources can be nested safely.
   */
-int kfd_process_evict_queues(struct kfd_process *p)
+int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset)
  {
  	struct kfd_process_device *pdd;
  	int r = 0;
@@ -956,7 +956,8 @@ int kfd_process_evict_queues(struct kfd_process *p)
list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
  		r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
-							    &pdd->qpd);
+							    &pdd->qpd,
+							    pre_reset);
  		if (r) {
  			pr_err("Failed to evict process queues\n");
  			goto fail;
@@ -1026,7 +1027,7 @@ static void evict_process_worker(struct work_struct *work)
  	flush_delayed_work(&p->restore_work);
pr_debug("Started evicting pasid 0x%x\n", p->pasid);
-	ret = kfd_process_evict_queues(p);
+	ret = kfd_process_evict_queues(p, false);
  	if (!ret) {
  		dma_fence_signal(p->ef);
  		dma_fence_put(p->ef);
@@ -1082,7 +1083,7 @@ static void restore_process_worker(struct work_struct *work)
  		pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
  }
-void kfd_suspend_all_processes(void)
+void kfd_suspend_all_processes(bool pre_reset)
  {
  	struct kfd_process *p;
  	unsigned int temp;
@@ -1092,7 +1093,7 @@ void kfd_suspend_all_processes(void)
  		cancel_delayed_work_sync(&p->eviction_work);
  		cancel_delayed_work_sync(&p->restore_work);
- if (kfd_process_evict_queues(p))
+		if (kfd_process_evict_queues(p, pre_reset))
  			pr_err("Failed to suspend process 0x%x\n", p->pasid);
  		dma_fence_signal(p->ef);
  		dma_fence_put(p->ef);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index d3eacf72e8db..8fa856e6a03f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
  		/* destroy kernel queue (DIQ) */
  		dqm = pqn->kq->dev->dqm;
  		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
-		kernel_queue_uninit(pqn->kq);
+		kernel_queue_uninit(pqn->kq, false);
  	}
if (pqn->q) {
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux