If a queue being destroyed causes a HWS hang on removal, the KFD may
issue an unnecessary GPU reset even though the hang could have been
resolved by resetting the destroyed queue itself. This is because the
queue is removed from the KFD's queue list prior to the preemption
action on destroy, so the reset call fails to match the HQD PQ reset
information against the KFD's queue record and never performs the
per-queue reset.

Since a queue destroy request is issued under the same device lock as
any other preemption request (which subsumes queue reset calls),
transiently store a reference to the queue being destroyed so that a
potential subsequent queue reset call can check against this queue as
well.

Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 +++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 577d121cc6d1..09e39a72ca31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1842,6 +1842,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
 		goto fail_detect_hang_buffer;
 	}
 
+	dqm->cur_destroyed_queue = NULL;
+
 	dqm_unlock(dqm);
 	return 0;
 
@@ -2105,7 +2107,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
 		 q->properties.queue_id, q->process->pasid);
 
 	pdd->has_reset_queue = true;
-	if (q->properties.is_active) {
+	if (q->properties.is_active && dqm->cur_destroyed_queue != q) {
 		q->properties.is_active = false;
 		decrement_queue_count(dqm, qpd, q);
 	}
@@ -2160,6 +2162,10 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
 	struct qcm_process_device *qpd;
 	struct queue *q;
 
+	if (dqm->cur_destroyed_queue &&
+	    dqm->cur_destroyed_queue->properties.queue_address == queue_address)
+		return dqm->cur_destroyed_queue;
+
 	list_for_each_entry(cur, &dqm->queues, list) {
 		qpd = cur->qpd;
 		list_for_each_entry(q, &qpd->queues_list, list) {
@@ -2409,6 +2415,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	list_del(&q->list);
 	qpd->queue_count--;
+	dqm->cur_destroyed_queue = q;
 	if (q->properties.is_active) {
 		decrement_queue_count(dqm, qpd, q);
 		if (!dqm->dev->kfd->shared_resources.enable_mes) {
@@ -2421,6 +2428,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 			retval = remove_queue_mes(dqm, q, qpd);
 		}
 	}
+	dqm->cur_destroyed_queue = NULL;
 
 	/*
 	 * Unconditionally decrement this counter, regardless of the queue's
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 08b40826ad1e..5425c1dd7924 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -285,6 +285,7 @@ struct device_queue_manager {
 	struct dqm_detect_hang_info *detect_hang_info;
 	size_t detect_hang_info_size;
 	int detect_hang_count;
+	struct queue *cur_destroyed_queue;
 };
 
 void device_queue_manager_init_cik(
-- 
2.34.1