start_cpsch and stop_cpsch can be called during kfd device initialization or during gpu reset/recovery. So they can run concurrently. Currently in start_cpsch and stop_cpsch, pm_init and pm_uninit is not protected by the dpm lock. Imagine such a case that user use packet manager's function to submit a pm4 packet to hang hws (ie through command cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee /sys/kernel/debug/kfd/hang_hws), while kfd device is under device reset/recovery so packet manager can be not initialized. There will be unpredictable protection fault in such case. This patch moves pm_init/uninit inside the dpm lock and check packet manager is initialized before using packet manager function. Signed-off-by: Oak Zeng <Oak.Zeng@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +------- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 ++++++++-- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c51402b..adc2342 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) */ int kfd_debugfs_hang_hws(struct kfd_dev *dev) { - int r = 0; - if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { pr_err("HWS is not enabled"); return -EINVAL; } - r = pm_debugfs_hang_hws(&dev->dqm->dpm); - if (!r) - r = dqm_debugfs_execute_queues(dev->dqm); - - return r; + return dqm_debugfs_execute_queues(dev->dqm); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f2984d3..44136d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; + dqm_lock(dqm); retval = pm_init(&dqm->dpm, dqm); if (retval) goto fail_packet_manager_init; @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; dqm->is_resetting = false; @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) fail_set_sched_resources: pm_uninit(&dqm->dpm, false); fail_packet_manager_init: + dqm_unlock(dqm); return retval; } @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; - dqm_unlock(dqm); pm_release_ib(&dqm->dpm); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->dpm, hanging); + dqm_unlock(dqm); return 0; } @@ -2104,6 +2105,11 @@ int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) int r = 0; dqm_lock(dqm); + r = pm_debugfs_hang_hws(&dqm->dpm); + if (r) { + dqm_unlock(dqm); + return r; + } dqm->active_runlist = true; r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); dqm_unlock(dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index b130cc0..0123e64 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) uint32_t *buffer, size; int r = 0; + if (!pm->priv_queue) + return -EBUSY; + size = pm->pmf->query_status_size; mutex_lock(&pm->lock); kq_acquire_packet_buffer(pm->priv_queue, -- 2.7.4 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx