On 2024-07-26 11:30, Jonathan Kim wrote: > Support per-queue reset for GFX9. The recommendation is for the driver > to target reset the HW queue via a SPI MMIO register write. > > Since this requires pipe and HW queue info and MEC FW is limited to > doorbell reports of hung queues after an unmap failure, scan the HW > queue slots defined by SET_RESOURCES first to identify the user queue > candidates to reset. > > Only signal reset events to processes that have had a queue reset. > > If queue reset fails, fall back to GPU reset. > > v2: move reset queue flag for house keeping to process device. > split detect and reset into separate functions. > make reset call safe during power saving modes. > clean up some other nitpicks. Some more nit-picks inline. > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > --- > .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 2 + > .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 16 ++ > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h | 9 + > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c | 18 +- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 85 +++++++++ > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 9 + > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 172 +++++++++++++++++- > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 12 ++ > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 21 +++ > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 6 +- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 + > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 + > .../gpu/drm/amd/include/kgd_kfd_interface.h | 6 + > 16 files changed, 360 insertions(+), 13 deletions(-) > [snip] > @@ -1680,6 +1700,14 @@ static int start_cpsch(struct device_queue_manager *dqm) > &dqm->wait_times); > } > > + /* setup per-queue reset detection buffer */ > + num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe * > + 
dqm->dev->kfd->shared_resources.num_pipe_per_mec * > + NUM_XCC(dqm->dev->xcc_mask); > + > + dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info); > + dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL); You need to check the return value and handle allocation failures. > + > dqm_unlock(dqm); > > return 0; > @@ -1713,6 +1741,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > if (!dqm->dev->kfd->shared_resources.enable_mes) > pm_uninit(&dqm->packet_mgr); > + kfree(dqm->detect_hang_info); Reset dqm->detect_hang_info to NULL to avoid a dangling pointer. > dqm_unlock(dqm); > > return 0; > @@ -1929,6 +1958,131 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) > return retval; > } > > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, > + struct qcm_process_device *qpd) > +{ > + struct kfd_process_device *pdd = qpd_to_pdd(qpd); > + > + pr_err("queue id 0x%0x at pasid 0x%0x is reset\n", > + q->properties.queue_id, q->process->pasid); > + > + pdd->has_reset_queue = true; > + if (q->properties.is_active) { > + q->properties.is_active = false; > + decrement_queue_count(dqm, qpd, q); > + } > +} > + > +static int detect_queue_hang(struct device_queue_manager *dqm) > +{ > + int i; > + > + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); Set dqm->detect_hang_count to 0 to avoid overflows in case multiple hang detections get kicked off. Or if that's not possible, just print a WARN_ON if detect_hang_count is not 0 and return. 
> + > + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { > + uint32_t mec, pipe, queue; [snip] > @@ -1244,12 +1245,32 @@ void kfd_signal_reset_event(struct kfd_node *dev) > idx = srcu_read_lock(&kfd_processes_srcu); > hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { > int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); > + struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p); > > if (unlikely(user_gpu_id == -EINVAL)) { > WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); > continue; > } > > + if (unlikely(!pdd)) { > + WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid); > + continue; > + } > + > + if (dev->dqm->detect_hang_count && !pdd->has_reset_queue) > + continue; > + > + if (dev->dqm->detect_hang_count) { > + struct amdgpu_task_info *ti; > + > + ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid); > + if (ti) { > + DRM_ERROR("Process info: process %s tid %d thread %s pid %d\n", > + ti->process_name, ti->tgid, ti->task_name, ti->pid); This looks like debug code that's not meant to be checked in. If you mean to check it in, the message needs more information. Use dev_err or dev_info to print the affected GPU and put something about a queue reset into the message. 
Regards, Felix > + amdgpu_vm_put_task_info(ti); > + } > + } > + > rcu_read_lock(); > > id = KFD_FIRST_NONSIGNAL_EVENT_ID; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > index 66c73825c0a0..84e8ea3a8a0c 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, > static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) > { > struct v9_mqd *m = (struct v9_mqd *)mqd; > + uint32_t doorbell_id = m->queue_doorbell_id0; > > - return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); > + m->queue_doorbell_id0 = 0; > + > + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0); > } > > static int get_wave_state(struct mqd_manager *mm, void *mqd, > @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd) > m = get_mqd(mqd + hiq_mqd_size * inst); > ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, > m->queue_doorbell_id0, inst); > + m->queue_doorbell_id0 = 0; > ++inst; > } > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index b5cae48dff66..892a85408c09 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -843,6 +843,9 @@ struct kfd_process_device { > void *proc_ctx_bo; > uint64_t proc_ctx_gpu_addr; > void *proc_ctx_cpu_ptr; > + > + /* Tracks queue reset status */ > + bool has_reset_queue; > }; > > #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 9e29b92eb523..a902950cc060 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) > goto 
fail; > } > n_evicted++; > + > + pdd->dev->dqm->is_hws_hang = false; > } > > return r; > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > index 6d094cf3587d..7744ca3ef4b1 100644 > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > @@ -318,6 +318,12 @@ struct kfd2kgd_calls { > void (*program_trap_handler_settings)(struct amdgpu_device *adev, > uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, > uint32_t inst); > + uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst); > + uint64_t (*hqd_reset)(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst, unsigned int utimeout); > }; > > #endif /* KGD_KFD_INTERFACE_H_INCLUDED */