[Public] > -----Original Message----- > From: Alex Deucher <alexdeucher@xxxxxxxxx> > Sent: Friday, July 26, 2024 2:57 PM > To: Kim, Jonathan <Jonathan.Kim@xxxxxxx> > Cc: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Kuehling, Felix > <Felix.Kuehling@xxxxxxx>; Deucher, Alexander > <Alexander.Deucher@xxxxxxx> > Subject: Re: [PATCH 2/2] drm/amdkfd: support the debugger during per- > queue reset > > Caution: This message originated from an External Source. Use proper caution > when opening attachments, clicking links, or responding. > > > On Fri, Jul 26, 2024 at 11:40 AM Jonathan Kim <Jonathan.Kim@xxxxxxx> > wrote: > > > > In order to allow ROCm GDB to handle reset queues, raise an > > EC_QUEUE_RESET exception so that the debugger can subscribe and > > query this exception. > > > > Reset queues should still be considered suspendable with a status > > flag of KFD_DBG_QUEUE_RESET_MASK. > > However they should not be resumable since user space will no longer > > be able to access reset queues. > > Do you have a link to the proposed debugger code which uses this? Not yet. + Laurent/Lancelot for awareness. Jon > > Alex > > > > > v2: move per-queue reset flag to this patch > > rebase based on patch 1 changes > > > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > > --- > > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 > ++++++++++++++++--- > > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + > > include/uapi/linux/kfd_ioctl.h | 4 +++ > > 3 files changed, 31 insertions(+), 5 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > index e335703eff84..cb7b5bbf5c40 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > @@ -164,6 +164,10 @@ static void kfd_hws_hang(struct > device_queue_manager *dqm) > > struct kfd_process_device *pdd = qpd_to_pdd(qpd); > > > > pdd->has_reset_queue = true; > > + q->properties.is_reset = true; > > + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), > > + q->process, q->device, q->doorbell_id, > > + false, NULL, 0); > > } > > } > > > > @@ -986,7 +990,7 @@ static int suspend_single_queue(struct > device_queue_manager *dqm, > > { > > bool is_new; > > > > - if (q->properties.is_suspended) > > + if (q->properties.is_suspended || q->properties.is_reset) > > return 0; > > > > pr_debug("Suspending PASID %u queue [%i]\n", > > @@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct > device_queue_manager *dqm, > > if (dqm->dev->kfd->shared_resources.enable_mes) { > > int r = remove_queue_mes(dqm, q, &pdd->qpd); > > > > + if (q->properties.is_reset) > > + return 0; > > + > > if (r) > > return r; > > } > > @@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct > device_queue_manager *dqm, struct queue *q > > q->properties.queue_id, q->process->pasid); > > > > pdd->has_reset_queue = true; > > + q->properties.is_reset = true; > > if (q->properties.is_active) { > > q->properties.is_active = false; > > decrement_queue_count(dqm, qpd, q); > > } > > + > > + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q- > >device, > > + q->doorbell_id, false, NULL, 0); > > } > > > > static int detect_queue_hang(struct device_queue_manager *dqm) > > @@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p, > > queue_ids[q_idx] &= > > ~KFD_DBG_QUEUE_INVALID_MASK; > > } else { > > - queue_ids[q_idx] |= > > + queue_ids[q_idx] |= q->properties.is_reset ? > > + KFD_DBG_QUEUE_RESET_MASK : > > KFD_DBG_QUEUE_ERROR_MASK; > > break; > > } > > @@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p, > > queue_ids); > > > > /* mask queue as error on resume fail */ > > - if (q_idx != QUEUE_NOT_FOUND) > > + if (q_idx != QUEUE_NOT_FOUND && !q- > >properties.is_reset) > > queue_ids[q_idx] |= > > KFD_DBG_QUEUE_ERROR_MASK; > > } > > @@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p, > > struct qcm_process_device *qpd = &pdd->qpd; > > struct queue *q; > > int r, per_device_suspended = 0; > > + bool has_queue_reset_fail = false; > > > > mutex_lock(&p->event_mutex); > > dqm_lock(dqm); > > @@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p, > > > > if (!err) { > > queue_ids[q_idx] &= > ~KFD_DBG_QUEUE_INVALID_MASK; > > + if (q->properties.is_reset) > > + queue_ids[q_idx] |= > KFD_DBG_QUEUE_RESET_MASK; > > + > > if (exception_clear_mask && is_mes) > > q->properties.exception_status &= > > ~exception_clear_mask; > > @@ -3176,13 +3192,18 @@ int suspend_queues(struct kfd_process *p, > > continue; > > > > /* mask queue as error on suspend fail */ > > - if (r) > > + if (r && !q->properties.is_reset) { > > + has_queue_reset_fail = true; > > queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; > > - else if (exception_clear_mask) > > + } else if (exception_clear_mask) { > > q->properties.exception_status &= > > ~exception_clear_mask; > > + } > > } > > > > + if (!has_queue_reset_fail) > > + total_suspended += per_device_suspended; > > + > > dqm_unlock(dqm); > > mutex_unlock(&p->event_mutex); > > amdgpu_device_flush_hdp(dqm->dev->adev, NULL); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > index 892a85408c09..192e3102c152 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > @@ -504,6 +504,7 @@ struct queue_properties { > > bool is_being_destroyed; > > bool is_active; > > bool is_gws; > > + bool is_reset; > > uint32_t pm4_target_xcc; > > bool is_dbg_wa; > > bool is_user_cu_masked; > > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h > > index 285a36601dc9..4713f9a6796e 100644 > > --- a/include/uapi/linux/kfd_ioctl.h > > +++ b/include/uapi/linux/kfd_ioctl.h > > @@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code { > > EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21, > > EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22, > > EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23, > > + EC_QUEUE_RESET = 29, > > EC_QUEUE_PREEMPTION_ERROR = 30, > > EC_QUEUE_NEW = 31, > > /* per device */ > > @@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code { > > > KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID > ) | \ > > > KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \ > > > KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \ > > + KFD_EC_MASK(EC_QUEUE_RESET) | \ > > KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \ > > KFD_EC_MASK(EC_QUEUE_NEW)) > > #define KFD_EC_MASK_DEVICE > (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \ > > @@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry { > > }; > > > > /* Queue status return for suspend/resume */ > > +#define KFD_DBG_QUEUE_RESET_BIT 29 > > #define KFD_DBG_QUEUE_ERROR_BIT 30 > > #define KFD_DBG_QUEUE_INVALID_BIT 31 > > +#define KFD_DBG_QUEUE_RESET_MASK (1 << > KFD_DBG_QUEUE_RESET_BIT) > > #define KFD_DBG_QUEUE_ERROR_MASK (1 << > KFD_DBG_QUEUE_ERROR_BIT) > > #define KFD_DBG_QUEUE_INVALID_MASK (1 << > KFD_DBG_QUEUE_INVALID_BIT) > > > > -- > > 2.34.1 > >