On 2024-07-26 11:30, Jonathan Kim wrote: > Support per-queue reset for GFX9. The recommendation is for the driver > to target reset the HW queue via a SPI MMIO register write. > > Since this requires pipe and HW queue info and MEC FW is limited to > doorbell reports of hung queues after an unmap failure, scan the HW > queue slots defined by SET_RESOURCES first to identify the user queue > candidates to reset. > > Only signal reset events to processes that have had a queue reset. > > If queue reset fails, fall back to GPU reset. > > v2: move reset queue flag for house keeping to process device. > split detect and reset into separate functions. > make reset call safe during power saving modes. > clean up some other nitpicks. Some more nit-picks inline. > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > --- > .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 2 + > .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 16 ++ > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h | 9 + > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c | 18 +- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 85 +++++++++ > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 9 + > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 172 +++++++++++++++++- > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 12 ++ > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 21 +++ > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 6 +- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 + > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 + > .../gpu/drm/amd/include/kgd_kfd_interface.h | 6 + > 16 files changed, 360 insertions(+), 13 deletions(-) > [snip] > @@ -1680,6 +1700,14 @@ static int start_cpsch(struct device_queue_manager *dqm) > &dqm->wait_times); > } > > + /* setup per-queue reset detection buffer */ > + num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe * > + 
dqm->dev->kfd->shared_resources.num_pipe_per_mec * > + NUM_XCC(dqm->dev->xcc_mask); > + > + dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info); > + dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL); You need to check the return value and handle allocation failures. > + > dqm_unlock(dqm); > > return 0; > @@ -1713,6 +1741,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > if (!dqm->dev->kfd->shared_resources.enable_mes) > pm_uninit(&dqm->packet_mgr); > + kfree(dqm->detect_hang_info); Reset dqm->detect_hang_info to NULL to avoid a dangling pointer. > dqm_unlock(dqm); > > return 0; > @@ -1929,6 +1958,131 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) > return retval; > } > > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, > + struct qcm_process_device *qpd) > +{ > + struct kfd_process_device *pdd = qpd_to_pdd(qpd); > + > + pr_err("queue id 0x%0x at pasid 0x%0x is reset\n", > + q->properties.queue_id, q->process->pasid); > + > + pdd->has_reset_queue = true; > + if (q->properties.is_active) { > + q->properties.is_active = false; > + decrement_queue_count(dqm, qpd, q); > + } > +} > + > +static int detect_queue_hang(struct device_queue_manager *dqm) > +{ > + int i; > + > + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); Set dqm->detect_hang_count to 0 to avoid overflows in case multiple hang detections get kicked off. Or if that's not possible, just print a WARN_ON if detect_hang_count is not 0 and return. 
> + > + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { > + uint32_t mec, pipe, queue; [snip] > @@ -1244,12 +1245,32 @@ void kfd_signal_reset_event(struct kfd_node *dev) > idx = srcu_read_lock(&kfd_processes_srcu); > hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { > int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); > + struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p); > > if (unlikely(user_gpu_id == -EINVAL)) { > WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); > continue; > } > > + if (unlikely(!pdd)) { > + WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid); > + continue; > + } > + > + if (dev->dqm->detect_hang_count && !pdd->has_reset_queue) > + continue; > + > + if (dev->dqm->detect_hang_count) { > + struct amdgpu_task_info *ti; > + > + ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid); > + if (ti) { > + DRM_ERROR("Process info: process %s tid %d thread %s pid %d\n", > + ti->process_name, ti->tgid, ti->task_name, ti->pid); This looks like debug code that's not meant to be checked in. If you mean to check it in, the message needs more information. Use dev_err or dev_info to print the affected GPU and put something about a queue reset into the message. 
Regards, Felix > + amdgpu_vm_put_task_info(ti); > + } > + } > + > rcu_read_lock(); > > id = KFD_FIRST_NONSIGNAL_EVENT_ID; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > index 66c73825c0a0..84e8ea3a8a0c 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, > static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) > { > struct v9_mqd *m = (struct v9_mqd *)mqd; > + uint32_t doorbell_id = m->queue_doorbell_id0; > > - return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); > + m->queue_doorbell_id0 = 0; > + > + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0); > } > > static int get_wave_state(struct mqd_manager *mm, void *mqd, > @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd) > m = get_mqd(mqd + hiq_mqd_size * inst); > ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, > m->queue_doorbell_id0, inst); > + m->queue_doorbell_id0 = 0; > ++inst; > } > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index b5cae48dff66..892a85408c09 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -843,6 +843,9 @@ struct kfd_process_device { > void *proc_ctx_bo; > uint64_t proc_ctx_gpu_addr; > void *proc_ctx_cpu_ptr; > + > + /* Tracks queue reset status */ > + bool has_reset_queue; > }; > > #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 9e29b92eb523..a902950cc060 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) > goto 
fail; > } > n_evicted++; > + > + pdd->dev->dqm->is_hws_hang = false; > } > > return r; > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > index 6d094cf3587d..7744ca3ef4b1 100644 > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > @@ -318,6 +318,12 @@ struct kfd2kgd_calls { > void (*program_trap_handler_settings)(struct amdgpu_device *adev, > uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, > uint32_t inst); > + uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst); > + uint64_t (*hqd_reset)(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst, unsigned int utimeout); > }; > > #endif /* KGD_KFD_INTERFACE_H_INCLUDED */