Re: [PATCH] drm/amdkfd: support per-queue reset on gfx9

Felix Kuehling <felix.kuehling@xxxxxxx> · Wed, 31 Jul 2024 11:44:35 -0400

On 2024-07-31 09:37, Jonathan Kim wrote:
Support per-queue reset for GFX9.  The recommendation is for the driver
to target reset the HW queue via a SPI MMIO register write.

Since this requires pipe and HW queue info and MEC FW is limited to
doorbell reports of hung queues after an unmap failure, scan the HW
queue slots defined by SET_RESOURCES first to identify the user queue
candidates to reset.

Only signal reset events to processes that have had a queue reset.

If queue reset fails, fall back to GPU reset.

v3: address nitpicks
- handle hang detect buffer ENOMEM
- warn on multiple detect hang misuse
- reset hang detect buffer to NULL on free
- update DRM_ERR on reset to drm_err app warning message

I meant dev_err here to make sure we print the device identifier. That's 
what we mostly use in KFD. If drm_err does the same, that's fine, too. 
Looking at the definitions in drm_print.h, the only thing that drm_err 
adds is a "[drm]" tag in the message.

See one more comment inline.



v2: move reset queue flag for house keeping to process device.
split detect and reset into separate functions.
make reset call safe during power saving modes.
clean up some other nitpicks.

Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx>

[snip]

@@ -1929,6 +1966,135 @@ static int map_queues_cpsch(struct 
device_queue_manager *dqm)
  	return retval;
  }
  
+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
+			       struct qcm_process_device *qpd)
+{
+	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+
+	pr_err("queue id 0x%0x at pasid 0x%0x is reset\n",
+	       q->properties.queue_id, q->process->pasid);

This could also be a dev_err(dqm->dev->adev->dev, ...) or 
drm_err(dqm->dev->adev->ddev, ...). With that fixed, the patch is

Reviewed-by: Felix Kuehling <felix.kuehling@xxxxxxx>


+
+	pdd->has_reset_queue = true;
+	if (q->properties.is_active) {
+		q->properties.is_active = false;
+		decrement_queue_count(dqm, qpd, q);
+	}
+}
+
+static int detect_queue_hang(struct device_queue_manager *dqm)
+{
+	int i;
+
+	/* detect should be used only in dqm locked queue reset */
+	if (WARN_ON(dqm->detect_hang_count > 0))
+		return 0;
+
+	memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size);
+
+	for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
+		uint32_t mec, pipe, queue;
+		int xcc_id;
+
+		mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe)
+			/ dqm->dev->kfd->shared_resources.num_pipe_per_mec;
+
+		if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap))
+			continue;
+
+		amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue);
+
+		for_each_inst(xcc_id, dqm->dev->xcc_mask) {
+			uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_get_pq_addr(
+						dqm->dev->adev, pipe, queue, xcc_id);
+			struct dqm_detect_hang_info hang_info;
+
+			if (!queue_addr)
+				continue;
+
+			hang_info.pipe_id = pipe;
+			hang_info.queue_id = queue;
+			hang_info.xcc_id = xcc_id;
+			hang_info.queue_address = queue_addr;
+
+			dqm->detect_hang_info[dqm->detect_hang_count] = hang_info;
+			dqm->detect_hang_count++;
+		}
+	}
+
+	return dqm->detect_hang_count;
+}
+
+static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address)
+{
+	struct device_process_node *cur;
+	struct qcm_process_device *qpd;
+	struct queue *q;
+
+	list_for_each_entry(cur, &dqm->queues, list) {
+		qpd = cur->qpd;
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			if (queue_address == q->properties.queue_address)
+				return q;
+		}
+	}
+
+	return NULL;
+}
+
+/* only for compute queue */
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm)
+{
+	int r = 0, reset_count = 0, i;
+
+	if (!dqm->detect_hang_info || dqm->is_hws_hang)
+		return -EIO;
+
+	/* assume dqm locked. */
+	if (!detect_queue_hang(dqm))
+		return -ENOTRECOVERABLE;
+
+	for (i = 0; i < dqm->detect_hang_count; i++) {
+		struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i];
+		struct queue *q = find_queue_by_address(dqm, hang_info.queue_address);
+		struct kfd_process_device *pdd;
+		uint64_t queue_addr = 0;
+
+		if (!q) {
+			r = -ENOTRECOVERABLE;
+			goto reset_fail;
+		}
+
+		pdd = kfd_get_process_device_data(dqm->dev, q->process);
+		if (!pdd) {
+			r = -ENOTRECOVERABLE;
+			goto reset_fail;
+		}
+
+		queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev->adev,
+				hang_info.pipe_id, hang_info.queue_id, hang_info.xcc_id,
+				KFD_UNMAP_LATENCY_MS);
+
+		/* either reset failed or we reset an unexpected queue. */
+		if (queue_addr != q->properties.queue_address) {
+			r = -ENOTRECOVERABLE;
+			goto reset_fail;
+		}
+
+		set_queue_as_reset(dqm, q, &pdd->qpd);
+		reset_count++;
+	}
+
+	if (reset_count == dqm->detect_hang_count)
+		kfd_signal_reset_event(dqm->dev);
+	else
+		r = -ENOTRECOVERABLE;
+
+reset_fail:
+	dqm->detect_hang_count = 0;
+
+	return r;
+}
+
  /* dqm->lock mutex has to be locked before calling this function */
  static int unmap_queues_cpsch(struct device_queue_manager *dqm,
  				enum kfd_unmap_queues_filter filter,
@@ -1979,11 +2145,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
  	 */
  	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
  	if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
-		while (halt_if_hws_hang)
-			schedule();
-		kfd_hws_hang(dqm);
-		retval = -ETIME;
-		goto out;
+		if (reset_queues_on_hws_hang(dqm)) {
+			while (halt_if_hws_hang)
+				schedule();
+			dqm->is_hws_hang = true;
+			kfd_hws_hang(dqm);
+			retval = -ETIME;
+			goto out;
+		}
  	}
  
  	/* We need to reset the grace period value for this device */
@@ -2002,8 +2171,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
  }
  
  /* only for compute queue */
-static int reset_queues_cpsch(struct device_queue_manager *dqm,
-			uint16_t pasid)
+static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid)
  {
  	int retval;

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 3b9b8eabaacc..dfb36a246637 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -210,6 +210,13 @@ struct device_queue_manager_asic_ops {
  				 struct kfd_node *dev);
  };
  
+struct dqm_detect_hang_info {
+	int pipe_id;
+	int queue_id;
+	int xcc_id;
+	uint64_t queue_address;
+};
+
  /**
   * struct device_queue_manager
   *
@@ -264,6 +271,11 @@ struct device_queue_manager {
  	uint32_t		wait_times;
  
  	wait_queue_head_t	destroy_wait;
+
+	/* for per-queue reset support */
+	struct dqm_detect_hang_info *detect_hang_info;
+	size_t detect_hang_info_size;
+	int detect_hang_count;
  };
  
  void device_queue_manager_init_cik(
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 9b33d9d2c9ad..9393ddc2e828 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -31,6 +31,7 @@
  #include <linux/memory.h>
  #include "kfd_priv.h"
  #include "kfd_events.h"
+#include "kfd_device_queue_manager.h"
  #include <linux/device.h>
  
  /*
@@ -1244,12 +1245,33 @@ void kfd_signal_reset_event(struct kfd_node *dev)
  	idx = srcu_read_lock(&kfd_processes_srcu);
  	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
  		int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+		struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p);
  
  		if (unlikely(user_gpu_id == -EINVAL)) {
  			WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
  			continue;
  		}
  
+		if (unlikely(!pdd)) {
+			WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid);
+			continue;
+		}
+
+		if (dev->dqm->detect_hang_count && !pdd->has_reset_queue)
+			continue;
+
+		if (dev->dqm->detect_hang_count) {
+			struct amdgpu_task_info *ti;
+
+			ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
+			if (ti) {
+				drm_err(&dev->adev->ddev,
+					"Queues reset on process %s tid %d thread %s pid %d\n",
+					ti->process_name, ti->tgid, ti->task_name, ti->pid);
+				amdgpu_vm_put_task_info(ti);
+			}
+		}
+
  		rcu_read_lock();
  
  		id = KFD_FIRST_NONSIGNAL_EVENT_ID;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 66c73825c0a0..84e8ea3a8a0c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
  static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
  {
  	struct v9_mqd *m = (struct v9_mqd *)mqd;
+	uint32_t doorbell_id = m->queue_doorbell_id0;
  
-	return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
+	m->queue_doorbell_id0 = 0;
+
+	return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0);
  }
  
  static int get_wave_state(struct mqd_manager *mm, void *mqd,
@@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)
  		m = get_mqd(mqd + hiq_mqd_size * inst);
  		ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,
  					m->queue_doorbell_id0, inst);
+		m->queue_doorbell_id0 = 0;
  		++inst;
  	}
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index b5cae48dff66..892a85408c09 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -843,6 +843,9 @@ struct kfd_process_device {
  	void *proc_ctx_bo;
  	uint64_t proc_ctx_gpu_addr;
  	void *proc_ctx_cpu_ptr;
+
+	/* Tracks queue reset status */
+	bool has_reset_queue;
  };
  
  #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 9e29b92eb523..a902950cc060 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
  			goto fail;
  		}
  		n_evicted++;
+
+		pdd->dev->dqm->is_hws_hang = false;
  	}
  
  	return r;
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 6d094cf3587d..7744ca3ef4b1 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -318,6 +318,12 @@ struct kfd2kgd_calls {
  	void (*program_trap_handler_settings)(struct amdgpu_device *adev,
  			uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
  			uint32_t inst);
+	uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev,
+				    uint32_t pipe_id, uint32_t queue_id,
+				    uint32_t inst);
+	uint64_t (*hqd_reset)(struct amdgpu_device *adev,
+			      uint32_t pipe_id, uint32_t queue_id,
+			      uint32_t inst, unsigned int utimeout);
  };
  
  #endif	/* KGD_KFD_INTERFACE_H_INCLUDED */