To allow ROCm GDB to handle reset queues, raise an EC_QUEUE_RESET
exception so that the debugger can subscribe to and query this
exception.

Reset queues should still be considered suspendable, with a status flag
of KFD_DBG_QUEUE_RESET_MASK. However, they should not be resumable,
since user space will no longer be able to access reset queues.

Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 30 +++++++++++++++----
 include/uapi/linux/kfd_ioctl.h                |  4 +++
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 037b75a64e66..8c41806df39e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -163,6 +163,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
 		list_for_each_entry(q, &qpd->queues_list, list) {
 			q->properties.is_reset = true;
 			q->process->has_reset_queue = true;
+
+			kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
+					 q->process, q->device, q->doorbell_id,
+					 false, NULL, 0);
 		}
 	}
 
@@ -987,7 +991,7 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 {
 	bool is_new;
 
-	if (q->properties.is_suspended)
+	if (q->properties.is_suspended || q->properties.is_reset)
 		return 0;
 
 	pr_debug("Suspending PASID %u queue [%i]\n",
@@ -1008,6 +1012,9 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 	if (dqm->dev->kfd->shared_resources.enable_mes) {
 		int r = remove_queue_mes(dqm, q, &pdd->qpd);
 
+		if (q->properties.is_reset)
+			return 0;
+
 		if (r)
 			return r;
 	}
@@ -1971,6 +1978,9 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
 		q->properties.is_active = false;
 		decrement_queue_count(dqm, qpd, q);
 	}
+
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
+			 q->doorbell_id, false, NULL, 0);
 }
 
 static int detect_queue_hang(struct device_queue_manager *dqm)
@@ -3036,7 +3046,8 @@ int resume_queues(struct kfd_process *p,
 					queue_ids[q_idx] &=
 						~KFD_DBG_QUEUE_INVALID_MASK;
 				} else {
-					queue_ids[q_idx] |=
+					queue_ids[q_idx] |= q->properties.is_reset ?
+							KFD_DBG_QUEUE_RESET_MASK :
 						KFD_DBG_QUEUE_ERROR_MASK;
 					break;
 				}
@@ -3071,7 +3082,7 @@ int resume_queues(struct kfd_process *p,
 						queue_ids);
 
 				/* mask queue as error on resume fail */
-				if (q_idx != QUEUE_NOT_FOUND)
+				if (q_idx != QUEUE_NOT_FOUND && !q->properties.is_reset)
 					queue_ids[q_idx] |=
 						KFD_DBG_QUEUE_ERROR_MASK;
 			}
@@ -3118,6 +3129,7 @@ int suspend_queues(struct kfd_process *p,
 		struct qcm_process_device *qpd = &pdd->qpd;
 		struct queue *q;
 		int r, per_device_suspended = 0;
+		bool has_queue_reset_fail = false;
 
 		mutex_lock(&p->event_mutex);
 		dqm_lock(dqm);
@@ -3134,6 +3146,9 @@ int suspend_queues(struct kfd_process *p,
 
 			if (!err) {
 				queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK;
+				if (q->properties.is_reset)
+					queue_ids[q_idx] |= KFD_DBG_QUEUE_RESET_MASK;
+
 				if (exception_clear_mask && is_mes)
 					q->properties.exception_status &=
							~exception_clear_mask;
@@ -3175,13 +3190,18 @@ int suspend_queues(struct kfd_process *p,
 				continue;
 
 			/* mask queue as error on suspend fail */
-			if (r)
+			if (r && !q->properties.is_reset) {
+				has_queue_reset_fail = true;
 				queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
-			else if (exception_clear_mask)
+			} else if (exception_clear_mask) {
 				q->properties.exception_status &=
							~exception_clear_mask;
+			}
 		}
 
+		if (!has_queue_reset_fail)
+			total_suspended += per_device_suspended;
+
 		dqm_unlock(dqm);
 		mutex_unlock(&p->event_mutex);
 		amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 285a36601dc9..4713f9a6796e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
 	EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
 	EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
 	EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
+	EC_QUEUE_RESET = 29,
 	EC_QUEUE_PREEMPTION_ERROR = 30,
 	EC_QUEUE_NEW = 31,
 	/* per device */
@@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
 				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |	\
 				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |	\
 				 KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) |	\
+				 KFD_EC_MASK(EC_QUEUE_RESET) |	\
 				 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) |	\
 				 KFD_EC_MASK(EC_QUEUE_NEW))
 #define KFD_EC_MASK_DEVICE	(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |	\
@@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
 };
 
 /* Queue status return for suspend/resume */
+#define KFD_DBG_QUEUE_RESET_BIT		29
 #define KFD_DBG_QUEUE_ERROR_BIT		30
 #define KFD_DBG_QUEUE_INVALID_BIT	31
+#define KFD_DBG_QUEUE_RESET_MASK	(1 << KFD_DBG_QUEUE_RESET_BIT)
 #define KFD_DBG_QUEUE_ERROR_MASK	(1 << KFD_DBG_QUEUE_ERROR_BIT)
 #define KFD_DBG_QUEUE_INVALID_MASK	(1 << KFD_DBG_QUEUE_INVALID_BIT)
-- 
2.34.1
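
For reviewers, a minimal user-space sketch (not part of the patch) of
how a debugger such as ROCm GDB might consume the new status bit after
a debug-trap suspend request: the KFD_DBG_QUEUE_* values mirror the
kfd_ioctl.h hunk above, while the helper name and output are
illustrative only.

/*
 * Illustrative only: classify the per-queue status bits that
 * suspend_queues() writes back into the caller's queue-ID array.
 * Mask values mirror include/uapi/linux/kfd_ioctl.h after this patch;
 * the helper name is made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define KFD_DBG_QUEUE_RESET_BIT		29
#define KFD_DBG_QUEUE_ERROR_BIT		30
#define KFD_DBG_QUEUE_INVALID_BIT	31
#define KFD_DBG_QUEUE_RESET_MASK	(1 << KFD_DBG_QUEUE_RESET_BIT)
#define KFD_DBG_QUEUE_ERROR_MASK	(1 << KFD_DBG_QUEUE_ERROR_BIT)
#define KFD_DBG_QUEUE_INVALID_MASK	(1 << KFD_DBG_QUEUE_INVALID_BIT)

static void classify_suspended_queues(const uint32_t *queue_ids,
				      int num_queues)
{
	for (int i = 0; i < num_queues; i++) {
		if (queue_ids[i] & KFD_DBG_QUEUE_INVALID_MASK) {
			printf("entry %d: queue not found\n", i);
		} else if (queue_ids[i] & KFD_DBG_QUEUE_RESET_MASK) {
			/* suspended, but reset: query EC_QUEUE_RESET and
			 * do not attempt to resume this queue */
			printf("entry %d: queue was reset\n", i);
		} else if (queue_ids[i] & KFD_DBG_QUEUE_ERROR_MASK) {
			printf("entry %d: suspend failed\n", i);
		} else {
			printf("entry %d: suspended\n", i);
		}
	}
}

Per the patch, a reset queue reports KFD_DBG_QUEUE_RESET_MASK on
suspend but is treated as a plain error on resume, so the RESET check
must come before the ERROR check only in the suspend path shown here.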