Patches 3 and 5 are Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> Am 2021-07-16 um 11:36 a.m. schrieb Oak Zeng: > start_cpsch and stop_cpsch can be called during kfd device > initialization or during gpu reset/recovery. So they can > run concurrently. Currently in start_cpsch and stop_cpsch, > pm_init and pm_uninit is not protected by the dpm lock. > Imagine such a case that user use packet manager's function > to submit a pm4 packet to hang hws (ie through command > cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee > /sys/kernel/debug/kfd/hang_hws), while kfd device is under > device reset/recovery so packet manager can be not initialized. > There will be unpredictable protection fault in such case. > > This patch moves pm_init/uninit inside the dpm lock and check > packet manager is initialized before using packet manager > function. > > Signed-off-by: Oak Zeng <Oak.Zeng@xxxxxxx> > Acked-by: Christian Konig <christian.koenig@xxxxxxx> > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +------- > drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 +++++++++--- > drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- > 4 files changed, 14 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index 03875d2..56b5010 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) > */ > int kfd_debugfs_hang_hws(struct kfd_dev *dev) > { > - int r = 0; > - > if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { > pr_err("HWS is not enabled"); > return -EINVAL; > } > > - r = pm_debugfs_hang_hws(&dev->dqm->packet_mgr); > - if (!r) > - r = dqm_debugfs_execute_queues(dev->dqm); > - > - return r; > + return dqm_debugfs_hang_hws(dev->dqm); > } > > #endif > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 6b2f594..6b89ca6 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) > > retval = 0; > > + dqm_lock(dqm); > retval = pm_init(&dqm->packet_mgr, dqm); > if (retval) > goto fail_packet_manager_init; > @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) > > init_interrupts(dqm); > > - dqm_lock(dqm); > /* clear hang status when driver try to start the hw scheduler */ > dqm->is_hws_hang = false; > dqm->is_resetting = false; > @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) > fail_set_sched_resources: > pm_uninit(&dqm->packet_mgr, false); > fail_packet_manager_init: > + dqm_unlock(dqm); > return retval; > } > > @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) > unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > hanging = dqm->is_hws_hang || dqm->is_resetting; > dqm->sched_running = false; > - dqm_unlock(dqm); > > pm_release_ib(&dqm->packet_mgr); > > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > pm_uninit(&dqm->packet_mgr, hanging); > + dqm_unlock(dqm); > > return 0; > } > @@ -2099,11 +2100,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) > return r; > } > > -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) > +int dqm_debugfs_hang_hws(struct device_queue_manager *dqm) > { > int r = 0; > > dqm_lock(dqm); > + r = pm_debugfs_hang_hws(&dqm->packet_mgr); > + if (r) { > + dqm_unlock(dqm); > + return r; > + } > dqm->active_runlist = true; > r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > dqm_unlock(dqm); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > index b130cc0..b33ebe8 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) > uint32_t *buffer, size; > int r = 0; > > + if (!pm->priv_queue) > + return -EAGAIN; > + > size = pm->pmf->query_status_size; > mutex_lock(&pm->lock); > kq_acquire_packet_buffer(pm->priv_queue, > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 3426743..8a5dfda 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -1194,7 +1194,7 @@ int pm_debugfs_runlist(struct seq_file *m, void *data); > > int kfd_debugfs_hang_hws(struct kfd_dev *dev); > int pm_debugfs_hang_hws(struct packet_manager *pm); > -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm); > +int dqm_debugfs_hang_hws(struct device_queue_manager *dqm); > > #else > _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx