On 2024-07-26 11:30, Jonathan Kim wrote: > In order to allow ROCm GDB to handle reset queues, raise an > EC_QUEUE_RESET exception so that the debugger can subscribe and > query this exception. > > Reset queues should still be considered suspendable with a status > flag of KFD_DBG_QUEUE_RESET_MASK. > However they should not be resumable since user space will no longer > be able to access reset queues. > > v2: move per-queue reset flag to this patch > rebase based on patch 1 changes > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > --- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 ++++++++++++++++--- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + > include/uapi/linux/kfd_ioctl.h | 4 +++ > 3 files changed, 31 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index e335703eff84..cb7b5bbf5c40 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -164,6 +164,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm) > struct kfd_process_device *pdd = qpd_to_pdd(qpd); > > pdd->has_reset_queue = true; > + q->properties.is_reset = true; > + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), > + q->process, q->device, q->doorbell_id, > + false, NULL, 0); > } > } > > @@ -986,7 +990,7 @@ static int suspend_single_queue(struct device_queue_manager *dqm, > { > bool is_new; > > - if (q->properties.is_suspended) > + if (q->properties.is_suspended || q->properties.is_reset) > return 0; > > pr_debug("Suspending PASID %u queue [%i]\n", > @@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct device_queue_manager *dqm, > if (dqm->dev->kfd->shared_resources.enable_mes) { > int r = remove_queue_mes(dqm, q, &pdd->qpd); > > + if (q->properties.is_reset) > + return 0; > + > if (r) > return r; > } > @@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct device_queue_manager 
*dqm, struct queue *q > q->properties.queue_id, q->process->pasid); > > pdd->has_reset_queue = true; > + q->properties.is_reset = true; > if (q->properties.is_active) { > q->properties.is_active = false; > decrement_queue_count(dqm, qpd, q); > } > + > + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device, > + q->doorbell_id, false, NULL, 0); > } > > static int detect_queue_hang(struct device_queue_manager *dqm) > @@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p, > queue_ids[q_idx] &= > ~KFD_DBG_QUEUE_INVALID_MASK; > } else { > - queue_ids[q_idx] |= > + queue_ids[q_idx] |= q->properties.is_reset ? > + KFD_DBG_QUEUE_RESET_MASK : > KFD_DBG_QUEUE_ERROR_MASK; > break; > } > @@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p, > queue_ids); > > /* mask queue as error on resume fail */ > - if (q_idx != QUEUE_NOT_FOUND) > + if (q_idx != QUEUE_NOT_FOUND && !q->properties.is_reset) > queue_ids[q_idx] |= > KFD_DBG_QUEUE_ERROR_MASK; > } > @@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p, > struct qcm_process_device *qpd = &pdd->qpd; > struct queue *q; > int r, per_device_suspended = 0; > + bool has_queue_reset_fail = false; > > mutex_lock(&p->event_mutex); > dqm_lock(dqm); > @@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p, > > if (!err) { > queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK; > + if (q->properties.is_reset) > + queue_ids[q_idx] |= KFD_DBG_QUEUE_RESET_MASK; > + > if (exception_clear_mask && is_mes) > q->properties.exception_status &= > ~exception_clear_mask; > @@ -3176,13 +3192,18 @@ int suspend_queues(struct kfd_process *p, > continue; > > /* mask queue as error on suspend fail */ > - if (r) > + if (r && !q->properties.is_reset) { > + has_queue_reset_fail = true; > queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; > - else if (exception_clear_mask) > + } else if (exception_clear_mask) { > q->properties.exception_status &= > ~exception_clear_mask; > + } > } > > + if (!has_queue_reset_fail) > + 
total_suspended += per_device_suspended; > + > dqm_unlock(dqm); > mutex_unlock(&p->event_mutex); > amdgpu_device_flush_hdp(dqm->dev->adev, NULL); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 892a85408c09..192e3102c152 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -504,6 +504,7 @@ struct queue_properties { > bool is_being_destroyed; > bool is_active; > bool is_gws; > + bool is_reset; > uint32_t pm4_target_xcc; > bool is_dbg_wa; > bool is_user_cu_masked; > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h > index 285a36601dc9..4713f9a6796e 100644 > --- a/include/uapi/linux/kfd_ioctl.h > +++ b/include/uapi/linux/kfd_ioctl.h > @@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code { > EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21, > EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22, > EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23, > + EC_QUEUE_RESET = 29, > EC_QUEUE_PREEMPTION_ERROR = 30, Do we really need to distinguish between preemption errors and resets? I mean, both are caused by preemption errors. The only difference is that in the "reset" case we succeeded in resetting only the affected queues.
> EC_QUEUE_NEW = 31, > /* per device */ > @@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code { > KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) | \ > KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \ > KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \ > + KFD_EC_MASK(EC_QUEUE_RESET) | \ > KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \ > KFD_EC_MASK(EC_QUEUE_NEW)) > #define KFD_EC_MASK_DEVICE (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \ > @@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry { > }; > > /* Queue status return for suspend/resume */ > +#define KFD_DBG_QUEUE_RESET_BIT 29 > #define KFD_DBG_QUEUE_ERROR_BIT 30 > #define KFD_DBG_QUEUE_INVALID_BIT 31 > +#define KFD_DBG_QUEUE_RESET_MASK (1 << KFD_DBG_QUEUE_RESET_BIT) Same as above. If the distinction is not necessary, we don't need to change the user mode API. Just report queue resets as generic preemption errors. Regards, Felix > #define KFD_DBG_QUEUE_ERROR_MASK (1 << KFD_DBG_QUEUE_ERROR_BIT) > #define KFD_DBG_QUEUE_INVALID_MASK (1 << KFD_DBG_QUEUE_INVALID_BIT) >