[Public] > -----Original Message----- > From: Kuehling, Felix <Felix.Kuehling@xxxxxxx> > Sent: Wednesday, July 31, 2024 11:45 AM > To: Kim, Jonathan <Jonathan.Kim@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx> > Subject: Re: [PATCH] drm/amdkfd: support per-queue reset on gfx9 > > > On 2024-07-31 09:37, Jonathan Kim wrote: > > Support per-queue reset for GFX9. The recommendation is for the driver > > to target reset the HW queue via a SPI MMIO register write. > > > > Since this requires pipe and HW queue info and MEC FW is limited to > > doorbell reports of hung queues after an unmap failure, scan the HW > > queue slots defined by SET_RESOURCES first to identify the user queue > > candidates to reset. > > > > Only signal reset events to processes that have had a queue reset. > > > > If queue reset fails, fall back to GPU reset. > > > > v3: address nitpicks > > - handle hang detect buffer ENOMEM > > - warn on multiple detect hang misuse > > - reset hang detect buffer to NULL on free > > - update DRM_ERR on reset to drm_err app warning message > > I meant dev_err here to make sure we print the device identifier. That's > what we mostly use in KFD. If drm_err does the same, that's fine, too. > Looking at the definitions in drm_print.h, the only thing that drm_err > adds is a "[drm]" tag in the message. > > See one more comment inline. > > > > > > v2: move reset queue flag for house keeping to process device. > > split detect and reset into separate functions. > > make reset call safe during power saving modes. > > clean up some other nitpicks. > > > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > > [snip] > > @@ -1929,6 +1966,135 @@ static int map_queues_cpsch(struct > device_queue_manager *dqm) > > return retval; > > } > > > > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct > queue *q, > > + struct qcm_process_device *qpd) > > +{ > > + struct kfd_process_device *pdd = qpd_to_pdd(qpd); > > + > > + pr_err("queue id 0x%0x at pasid 0x%0x is reset\n", > > + q->properties.queue_id, q->process->pasid); > > This could also be a dev_err(dqm->dev->adev->dev, ...) or > drm_err(dqm->dev->adev->ddev, ...). With that fixed, the patch is > > Reviewed-by: Felix Kuehling <felix.kuehling@xxxxxxx> Done. Changed both to dev_err. Thanks for the review. Jon > > > > + > > + pdd->has_reset_queue = true; > > + if (q->properties.is_active) { > > + q->properties.is_active = false; > > + decrement_queue_count(dqm, qpd, q); > > + } > > +} > > + > > +static int detect_queue_hang(struct device_queue_manager *dqm) > > +{ > > + int i; > > + > > + /* detect should be used only in dqm locked queue reset */ > > + if (WARN_ON(dqm->detect_hang_count > 0)) > > + return 0; > > + > > + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); > > + > > + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { > > + uint32_t mec, pipe, queue; > > + int xcc_id; > > + > > + mec = (i / dqm->dev->kfd- > >shared_resources.num_queue_per_pipe) > > + / dqm->dev->kfd- > >shared_resources.num_pipe_per_mec; > > + > > + if (mec || !test_bit(i, dqm->dev->kfd- > >shared_resources.cp_queue_bitmap)) > > + continue; > > + > > + amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, > i, &mec, &pipe, &queue); > > + > > + for_each_inst(xcc_id, dqm->dev->xcc_mask) { > > + uint64_t queue_addr = dqm->dev->kfd2kgd- > >hqd_get_pq_addr( > > + dqm->dev->adev, pipe, > queue, xcc_id); > > + struct dqm_detect_hang_info hang_info; > > + > > + if (!queue_addr) > > + continue; > > + > > + hang_info.pipe_id = pipe; > > + hang_info.queue_id = queue; > > + hang_info.xcc_id = xcc_id; > > + hang_info.queue_address = queue_addr; > > + > > + dqm->detect_hang_info[dqm->detect_hang_count] = > hang_info; > > + dqm->detect_hang_count++; > > + } > > + } > > + > > + return dqm->detect_hang_count; > > +} > > + > > +static struct queue *find_queue_by_address(struct device_queue_manager > *dqm, uint64_t queue_address) > > +{ > > + struct device_process_node *cur; > > + struct qcm_process_device *qpd; > > + struct queue *q; > > + > > + list_for_each_entry(cur, &dqm->queues, list) { > > + qpd = cur->qpd; > > + list_for_each_entry(q, &qpd->queues_list, list) { > > + if (queue_address == q->properties.queue_address) > > + return q; > > + } > > + } > > + > > + return NULL; > > +} > > + > > +/* only for compute queue */ > > +static int reset_queues_on_hws_hang(struct device_queue_manager > *dqm) > > +{ > > + int r = 0, reset_count = 0, i; > > + > > + if (!dqm->detect_hang_info || dqm->is_hws_hang) > > + return -EIO; > > + > > + /* assume dqm locked. */ > > + if (!detect_queue_hang(dqm)) > > + return -ENOTRECOVERABLE; > > + > > + for (i = 0; i < dqm->detect_hang_count; i++) { > > + struct dqm_detect_hang_info hang_info = dqm- > >detect_hang_info[i]; > > + struct queue *q = find_queue_by_address(dqm, > hang_info.queue_address); > > + struct kfd_process_device *pdd; > > + uint64_t queue_addr = 0; > > + > > + if (!q) { > > + r = -ENOTRECOVERABLE; > > + goto reset_fail; > > + } > > + > > + pdd = kfd_get_process_device_data(dqm->dev, q->process); > > + if (!pdd) { > > + r = -ENOTRECOVERABLE; > > + goto reset_fail; > > + } > > + > > + queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev- > >adev, > > + hang_info.pipe_id, hang_info.queue_id, > hang_info.xcc_id, > > + KFD_UNMAP_LATENCY_MS); > > + > > + /* either reset failed or we reset an unexpected queue. */ > > + if (queue_addr != q->properties.queue_address) { > > + r = -ENOTRECOVERABLE; > > + goto reset_fail; > > + } > > + > > + set_queue_as_reset(dqm, q, &pdd->qpd); > > + reset_count++; > > + } > > + > > + if (reset_count == dqm->detect_hang_count) > > + kfd_signal_reset_event(dqm->dev); > > + else > > + r = -ENOTRECOVERABLE; > > + > > +reset_fail: > > + dqm->detect_hang_count = 0; > > + > > + return r; > > +} > > + > > /* dqm->lock mutex has to be locked before calling this function */ > > static int unmap_queues_cpsch(struct device_queue_manager *dqm, > > enum kfd_unmap_queues_filter filter, > > @@ -1979,11 +2145,14 @@ static int unmap_queues_cpsch(struct > device_queue_manager *dqm, > > */ > > mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; > > if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm- > >packet_mgr.priv_queue->queue->mqd)) { > > - while (halt_if_hws_hang) > > - schedule(); > > - kfd_hws_hang(dqm); > > - retval = -ETIME; > > - goto out; > > + if (reset_queues_on_hws_hang(dqm)) { > > + while (halt_if_hws_hang) > > + schedule(); > > + dqm->is_hws_hang = true; > > + kfd_hws_hang(dqm); > > + retval = -ETIME; > > + goto out; > > + } > > } > > > > /* We need to reset the grace period value for this device */ > > @@ -2002,8 +2171,7 @@ static int unmap_queues_cpsch(struct > device_queue_manager *dqm, > > } > > > > /* only for compute queue */ > > -static int reset_queues_cpsch(struct device_queue_manager *dqm, > > - uint16_t pasid) > > +static int reset_queues_cpsch(struct device_queue_manager *dqm, > uint16_t pasid) > > { > > int retval; > > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > > index 3b9b8eabaacc..dfb36a246637 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > > @@ -210,6 +210,13 @@ struct device_queue_manager_asic_ops { > > struct kfd_node *dev); > > }; > > > > +struct dqm_detect_hang_info { > > + int pipe_id; > > + int queue_id; > > + int xcc_id; > > + uint64_t queue_address; > > +}; > > + > > /** > > * struct device_queue_manager > > * > > @@ -264,6 +271,11 @@ struct device_queue_manager { > > uint32_t wait_times; > > > > wait_queue_head_t destroy_wait; > > + > > + /* for per-queue reset support */ > > + struct dqm_detect_hang_info *detect_hang_info; > > + size_t detect_hang_info_size; > > + int detect_hang_count; > > }; > > > > void device_queue_manager_init_cik( > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > index 9b33d9d2c9ad..9393ddc2e828 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > @@ -31,6 +31,7 @@ > > #include <linux/memory.h> > > #include "kfd_priv.h" > > #include "kfd_events.h" > > +#include "kfd_device_queue_manager.h" > > #include <linux/device.h> > > > > /* > > @@ -1244,12 +1245,33 @@ void kfd_signal_reset_event(struct kfd_node > *dev) > > idx = srcu_read_lock(&kfd_processes_srcu); > > hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { > > int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); > > + struct kfd_process_device *pdd = > kfd_get_process_device_data(dev, p); > > > > if (unlikely(user_gpu_id == -EINVAL)) { > > WARN_ONCE(1, "Could not get user_gpu_id from > dev->id:%x\n", dev->id); > > continue; > > } > > > > + if (unlikely(!pdd)) { > > + WARN_ONCE(1, "Could not get device data from > pasid:0x%x\n", p->pasid); > > + continue; > > + } > > + > > + if (dev->dqm->detect_hang_count && !pdd- > >has_reset_queue) > > + continue; > > + > > + if (dev->dqm->detect_hang_count) { > > + struct amdgpu_task_info *ti; > > + > > + ti = amdgpu_vm_get_task_info_pasid(dev->adev, p- > >pasid); > > + if (ti) { > > + drm_err(&dev->adev->ddev, > > + "Queues reset on process %s tid %d > thread %s pid %d\n", > > + ti->process_name, ti->tgid, ti- > >task_name, ti->pid); > > + amdgpu_vm_put_task_info(ti); > > + } > > + } > > + > > rcu_read_lock(); > > > > id = KFD_FIRST_NONSIGNAL_EVENT_ID; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > index 66c73825c0a0..84e8ea3a8a0c 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager > *mm, void *mqd, > > static bool check_preemption_failed(struct mqd_manager *mm, void > *mqd) > > { > > struct v9_mqd *m = (struct v9_mqd *)mqd; > > + uint32_t doorbell_id = m->queue_doorbell_id0; > > > > - return kfd_check_hiq_mqd_doorbell_id(mm->dev, m- > >queue_doorbell_id0, 0); > > + m->queue_doorbell_id0 = 0; > > + > > + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0); > > } > > > > static int get_wave_state(struct mqd_manager *mm, void *mqd, > > @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct > mqd_manager *mm, void *mqd) > > m = get_mqd(mqd + hiq_mqd_size * inst); > > ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, > > m->queue_doorbell_id0, inst); > > + m->queue_doorbell_id0 = 0; > > ++inst; > > } > > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > index b5cae48dff66..892a85408c09 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > @@ -843,6 +843,9 @@ struct kfd_process_device { > > void *proc_ctx_bo; > > uint64_t proc_ctx_gpu_addr; > > void *proc_ctx_cpu_ptr; > > + > > + /* Tracks queue reset status */ > > + bool has_reset_queue; > > }; > > > > #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > index 9e29b92eb523..a902950cc060 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process > *p, uint32_t trigger) > > goto fail; > > } > > n_evicted++; > > + > > + pdd->dev->dqm->is_hws_hang = false; > > } > > > > return r; > > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > index 6d094cf3587d..7744ca3ef4b1 100644 > > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > @@ -318,6 +318,12 @@ struct kfd2kgd_calls { > > void (*program_trap_handler_settings)(struct amdgpu_device *adev, > > uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, > > uint32_t inst); > > + uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t inst); > > + uint64_t (*hqd_reset)(struct amdgpu_device *adev, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t inst, unsigned int utimeout); > > }; > > > > #endif /* KGD_KFD_INTERFACE_H_INCLUDED */