[Public] > -----Original Message----- > From: Kuehling, Felix <Felix.Kuehling@xxxxxxx> > Sent: Tuesday, July 30, 2024 6:07 PM > To: Kim, Jonathan <Jonathan.Kim@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx> > Subject: Re: [PATCH 1/2] drm/amdkfd: support per-queue reset on gfx9 > > > On 2024-07-26 11:30, Jonathan Kim wrote: > > Support per-queue reset for GFX9. The recommendation is for the driver > > to target reset the HW queue via a SPI MMIO register write. > > > > Since this requires pipe and HW queue info and MEC FW is limited to > > doorbell reports of hung queues after an unmap failure, scan the HW > > queue slots defined by SET_RESOURCES first to identify the user queue > > candidates to reset. > > > > Only signal reset events to processes that have had a queue reset. > > > > If queue reset fails, fall back to GPU reset. > > > > v2: move reset queue flag for house keeping to process device. > > split detect and reset into separate functions. > > make reset call safe during power saving modes. > > clean up some other nitpicks. > > Some more nit-picks inline. > > > > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > > --- > > .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 2 + > > .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 4 +- > > .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 4 +- > > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 16 ++ > > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h | 9 + > > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c | 4 +- > > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c | 18 +- > > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 85 +++++++++ > > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 9 + > > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 172 > +++++++++++++++++- > > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 12 ++ > > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 21 +++ > > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 6 +- > > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 + > > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 + > > .../gpu/drm/amd/include/kgd_kfd_interface.h | 6 + > > 16 files changed, 360 insertions(+), 13 deletions(-) > > > [snip] > > @@ -1680,6 +1700,14 @@ static int start_cpsch(struct > device_queue_manager *dqm) > > &dqm->wait_times); > > } > > > > + /* setup per-queue reset detection buffer */ > > + num_hw_queue_slots = dqm->dev->kfd- > >shared_resources.num_queue_per_pipe * > > + dqm->dev->kfd- > >shared_resources.num_pipe_per_mec * > > + NUM_XCC(dqm->dev->xcc_mask); > > + > > + dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct > dqm_detect_hang_info); > > + dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, > GFP_KERNEL); > > You need to check the return value and handle allocation failures. > > > + > > dqm_unlock(dqm); > > > > return 0; > > @@ -1713,6 +1741,7 @@ static int stop_cpsch(struct > device_queue_manager *dqm) > > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > > if (!dqm->dev->kfd->shared_resources.enable_mes) > > pm_uninit(&dqm->packet_mgr); > > + kfree(dqm->detect_hang_info); > > Reset dqm->detect_hang_info to NULL to avoid a dangling pointer. > > > dqm_unlock(dqm); > > > > return 0; > > @@ -1929,6 +1958,131 @@ static int map_queues_cpsch(struct > device_queue_manager *dqm) > > return retval; > > } > > > > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct > queue *q, > > + struct qcm_process_device *qpd) > > +{ > > + struct kfd_process_device *pdd = qpd_to_pdd(qpd); > > + > > + pr_err("queue id 0x%0x at pasid 0x%0x is reset\n", > > + q->properties.queue_id, q->process->pasid); > > + > > + pdd->has_reset_queue = true; > > + if (q->properties.is_active) { > > + q->properties.is_active = false; > > + decrement_queue_count(dqm, qpd, q); > > + } > > +} > > + > > +static int detect_queue_hang(struct device_queue_manager *dqm) > > +{ > > + int i; > > + > > + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); > > Set dqm->detect_hang_count to 0 to avoid overflows in case multiple hand > detections get kicked off. Or if that's not possible, just print a WARN_ON if > detect_hang_count is not 0 and return. > > > + > > + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { > > + uint32_t mec, pipe, queue; > [snip] > > @@ -1244,12 +1245,32 @@ void kfd_signal_reset_event(struct kfd_node > *dev) > > idx = srcu_read_lock(&kfd_processes_srcu); > > hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { > > int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); > > + struct kfd_process_device *pdd = > kfd_get_process_device_data(dev, p); > > > > if (unlikely(user_gpu_id == -EINVAL)) { > > WARN_ONCE(1, "Could not get user_gpu_id from > dev->id:%x\n", dev->id); > > continue; > > } > > > > + if (unlikely(!pdd)) { > > + WARN_ONCE(1, "Could not get device data from > pasid:0x%x\n", p->pasid); > > + continue; > > + } > > + > > + if (dev->dqm->detect_hang_count && !pdd- > >has_reset_queue) > > + continue; > > + > > + if (dev->dqm->detect_hang_count) { > > + struct amdgpu_task_info *ti; > > + > > + ti = amdgpu_vm_get_task_info_pasid(dev->adev, p- > >pasid); > > + if (ti) { > > + DRM_ERROR("Process info: process %s tid %d > thread %s pid %d\n", > > + ti->process_name, ti->tgid, ti->task_name, ti- > >pid); > > This looks like debug code that's not meant to be checked in. If you mean to > check it in, the message needs more information. Use dev_err or dev_info to > print the affected GPU and put something about a queue reset into the > message. Thanks for the review. Yes this was meant to be in as it's an ask to flag offending applications in dmesgs. I'll update it with your suggestions. Jon > > Regards, > Felix > > > > + amdgpu_vm_put_task_info(ti); > > + } > > + } > > + > > rcu_read_lock(); > > > > id = KFD_FIRST_NONSIGNAL_EVENT_ID; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > index 66c73825c0a0..84e8ea3a8a0c 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager > *mm, void *mqd, > > static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) > > { > > struct v9_mqd *m = (struct v9_mqd *)mqd; > > + uint32_t doorbell_id = m->queue_doorbell_id0; > > > > - return kfd_check_hiq_mqd_doorbell_id(mm->dev, m- > >queue_doorbell_id0, 0); > > + m->queue_doorbell_id0 = 0; > > + > > + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0); > > } > > > > static int get_wave_state(struct mqd_manager *mm, void *mqd, > > @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct > mqd_manager *mm, void *mqd) > > m = get_mqd(mqd + hiq_mqd_size * inst); > > ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, > > m->queue_doorbell_id0, inst); > > + m->queue_doorbell_id0 = 0; > > ++inst; > > } > > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > index b5cae48dff66..892a85408c09 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > @@ -843,6 +843,9 @@ struct kfd_process_device { > > void *proc_ctx_bo; > > uint64_t proc_ctx_gpu_addr; > > void *proc_ctx_cpu_ptr; > > + > > + /* Tracks queue reset status */ > > + bool has_reset_queue; > > }; > > > > #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > index 9e29b92eb523..a902950cc060 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process > *p, uint32_t trigger) > > goto fail; > > } > > n_evicted++; > > + > > + pdd->dev->dqm->is_hws_hang = false; > > } > > > > return r; > > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > index 6d094cf3587d..7744ca3ef4b1 100644 > > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > @@ -318,6 +318,12 @@ struct kfd2kgd_calls { > > void (*program_trap_handler_settings)(struct amdgpu_device *adev, > > uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, > > uint32_t inst); > > + uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t inst); > > + uint64_t (*hqd_reset)(struct amdgpu_device *adev, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t inst, unsigned int utimeout); > > }; > > > > #endif /* KGD_KFD_INTERFACE_H_INCLUDED */