start_cpsch and stop_cpsch can be called during kfd device initialization or during gpu reset/recovery. So they can run concurrently. Currently in start_cpsch and stop_cpsch, pm_init and pm_uninit is not protected by the dpm lock. Imagine such a case that user use packet manager's function to submit a pm4 packet to hang hws (ie through command cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee /sys/kernel/debug/kfd/hang_hws), while kfd device is under device reset/recovery so packet manager can be not initialized. There will be unpredictable protection fault in such case. This patch moves pm_init/uninit inside the dpm lock and check packet manager is initialized before using packet manager function. Signed-off-by: Oak Zeng <Oak.Zeng@xxxxxxx> Acked-by: Christian Konig <christian.koenig@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +------- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 +++++++++--- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 03875d2..56b5010 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) */ int kfd_debugfs_hang_hws(struct kfd_dev *dev) { - int r = 0; - if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { pr_err("HWS is not enabled"); return -EINVAL; } - r = pm_debugfs_hang_hws(&dev->dqm->packet_mgr); - if (!r) - r = dqm_debugfs_execute_queues(dev->dqm); - - return r; + return dqm_debugfs_hang_hws(dev->dqm); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 6b2f594..6b89ca6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; + dqm_lock(dqm); retval = pm_init(&dqm->packet_mgr, dqm); if (retval) goto fail_packet_manager_init; @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; dqm->is_resetting = false; @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) fail_set_sched_resources: pm_uninit(&dqm->packet_mgr, false); fail_packet_manager_init: + dqm_unlock(dqm); return retval; } @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; - dqm_unlock(dqm); pm_release_ib(&dqm->packet_mgr); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->packet_mgr, hanging); + dqm_unlock(dqm); return 0; } @@ -2099,11 +2100,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) return r; } -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) +int dqm_debugfs_hang_hws(struct device_queue_manager *dqm) { int r = 0; dqm_lock(dqm); + r = pm_debugfs_hang_hws(&dqm->packet_mgr); + if (r) { + dqm_unlock(dqm); + return r; + } dqm->active_runlist = true; r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); dqm_unlock(dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index b130cc0..b33ebe8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) uint32_t *buffer, size; int r = 0; + if (!pm->priv_queue) + return -EAGAIN; + size = pm->pmf->query_status_size; mutex_lock(&pm->lock); kq_acquire_packet_buffer(pm->priv_queue, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 3426743..8a5dfda 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1194,7 +1194,7 @@ int pm_debugfs_runlist(struct seq_file *m, void *data); int kfd_debugfs_hang_hws(struct kfd_dev *dev); int pm_debugfs_hang_hws(struct packet_manager *pm); -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm); +int dqm_debugfs_hang_hws(struct device_queue_manager *dqm); #else -- 2.7.4 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx