In order to allow ROCm GDB to handle reset queues, raise an EC_QUEUE_RESET exception so that the debugger can subscribe and query this exception. Reset queues should still be considered suspendable with a status flag of KFD_DBG_QUEUE_RESET_MASK. However, they should not be resumable since user space will no longer be able to access reset queues. v2: move per-queue reset flag to this patch; rebase based on patch 1 changes. Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 ++++++++++++++++--- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + include/uapi/linux/kfd_ioctl.h | 4 +++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index e335703eff84..cb7b5bbf5c40 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -164,6 +164,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm) struct kfd_process_device *pdd = qpd_to_pdd(qpd); pdd->has_reset_queue = true; + q->properties.is_reset = true; + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), + q->process, q->device, q->doorbell_id, + false, NULL, 0); } } @@ -986,7 +990,7 @@ static int suspend_single_queue(struct device_queue_manager *dqm, { bool is_new; - if (q->properties.is_suspended) + if (q->properties.is_suspended || q->properties.is_reset) return 0; pr_debug("Suspending PASID %u queue [%i]\n", @@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct device_queue_manager *dqm, if (dqm->dev->kfd->shared_resources.enable_mes) { int r = remove_queue_mes(dqm, q, &pdd->qpd); + if (q->properties.is_reset) + return 0; + if (r) return r; } @@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q q->properties.queue_id, q->process->pasid); pdd->has_reset_queue = true; + q->properties.is_reset = true; if 
(q->properties.is_active) { q->properties.is_active = false; decrement_queue_count(dqm, qpd, q); } + + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device, + q->doorbell_id, false, NULL, 0); } static int detect_queue_hang(struct device_queue_manager *dqm) @@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p, queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK; } else { - queue_ids[q_idx] |= + queue_ids[q_idx] |= q->properties.is_reset ? + KFD_DBG_QUEUE_RESET_MASK : KFD_DBG_QUEUE_ERROR_MASK; break; } @@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p, queue_ids); /* mask queue as error on resume fail */ - if (q_idx != QUEUE_NOT_FOUND) + if (q_idx != QUEUE_NOT_FOUND && !q->properties.is_reset) queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; } @@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p, struct qcm_process_device *qpd = &pdd->qpd; struct queue *q; int r, per_device_suspended = 0; + bool has_queue_reset_fail = false; mutex_lock(&p->event_mutex); dqm_lock(dqm); @@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p, if (!err) { queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK; + if (q->properties.is_reset) + queue_ids[q_idx] |= KFD_DBG_QUEUE_RESET_MASK; + if (exception_clear_mask && is_mes) q->properties.exception_status &= ~exception_clear_mask; @@ -3176,13 +3192,18 @@ int suspend_queues(struct kfd_process *p, continue; /* mask queue as error on suspend fail */ - if (r) + if (r && !q->properties.is_reset) { + has_queue_reset_fail = true; queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; - else if (exception_clear_mask) + } else if (exception_clear_mask) { q->properties.exception_status &= ~exception_clear_mask; + } } + if (!has_queue_reset_fail) + total_suspended += per_device_suspended; + dqm_unlock(dqm); mutex_unlock(&p->event_mutex); amdgpu_device_flush_hdp(dqm->dev->adev, NULL); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 892a85408c09..192e3102c152 100644 --- 
a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -504,6 +504,7 @@ struct queue_properties { bool is_being_destroyed; bool is_active; bool is_gws; + bool is_reset; uint32_t pm4_target_xcc; bool is_dbg_wa; bool is_user_cu_masked; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 285a36601dc9..4713f9a6796e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code { EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21, EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22, EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23, + EC_QUEUE_RESET = 29, EC_QUEUE_PREEMPTION_ERROR = 30, EC_QUEUE_NEW = 31, /* per device */ @@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code { KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) | \ KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \ KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \ + KFD_EC_MASK(EC_QUEUE_RESET) | \ KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \ KFD_EC_MASK(EC_QUEUE_NEW)) #define KFD_EC_MASK_DEVICE (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \ @@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry { }; /* Queue status return for suspend/resume */ +#define KFD_DBG_QUEUE_RESET_BIT 29 #define KFD_DBG_QUEUE_ERROR_BIT 30 #define KFD_DBG_QUEUE_INVALID_BIT 31 +#define KFD_DBG_QUEUE_RESET_MASK (1 << KFD_DBG_QUEUE_RESET_BIT) #define KFD_DBG_QUEUE_ERROR_MASK (1 << KFD_DBG_QUEUE_ERROR_BIT) #define KFD_DBG_QUEUE_INVALID_MASK (1 << KFD_DBG_QUEUE_INVALID_BIT) -- 2.34.1