Am 2021-07-15 um 9:34 p.m. schrieb Oak Zeng: > start_cpsch and stop_cpsch can be called during kfd device > initialization or during gpu reset/recovery. So they can > run concurrently. Currently in start_cpsch and stop_cpsch, > pm_init and pm_uninit is not protected by the dpm lock. > Imagine such a case that user use packet manager's function > to submit a pm4 packet to hang hws (ie through command > cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee > /sys/kernel/debug/kfd/hang_hws), while kfd device is under > device reset/recovery so packet manager can be not initialized. > There will be unpredictable protection fault in such case. > > This patch moves pm_init/uninit inside the dpm lock and check > packet manager is initialized before using packet manager > function. > > Signed-off-by: Oak Zeng <Oak.Zeng@xxxxxxx> > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +------- > drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 ++++++++-- > drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ > 3 files changed, 12 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index c51402b..adc2342 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) > */ > int kfd_debugfs_hang_hws(struct kfd_dev *dev) > { > - int r = 0; > - > if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { > pr_err("HWS is not enabled"); > return -EINVAL; > } > > - r = pm_debugfs_hang_hws(&dev->dqm->dpm); > - if (!r) > - r = dqm_debugfs_execute_queues(dev->dqm); > - > - return r; > + return dqm_debugfs_execute_queues(dev->dqm); This function should now probably be renamed to something like dqm_debugfs_hang_hws. Other than that, this patch looks good to me. Regards, Felix > } > > #endif > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index f2984d3..44136d2 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) > > retval = 0; > > + dqm_lock(dqm); > retval = pm_init(&dqm->dpm, dqm); > if (retval) > goto fail_packet_manager_init; > @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) > > init_interrupts(dqm); > > - dqm_lock(dqm); > /* clear hang status when driver try to start the hw scheduler */ > dqm->is_hws_hang = false; > dqm->is_resetting = false; > @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) > fail_set_sched_resources: > pm_uninit(&dqm->dpm, false); > fail_packet_manager_init: > + dqm_unlock(dqm); > return retval; > } > > @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) > unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > hanging = dqm->is_hws_hang || dqm->is_resetting; > dqm->sched_running = false; > - dqm_unlock(dqm); > > pm_release_ib(&dqm->dpm); > > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > pm_uninit(&dqm->dpm, hanging); > + dqm_unlock(dqm); > > return 0; > } > @@ -2104,6 +2105,11 @@ int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) > int r = 0; > > dqm_lock(dqm); > + r = pm_debugfs_hang_hws(&dqm->dpm); > + if (r) { > + dqm_unlock(dqm); > + return r; > + } > dqm->active_runlist = true; > r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > dqm_unlock(dqm); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > index b130cc0..0123e64 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) > uint32_t *buffer, size; > int r = 0; > > + if (!pm->priv_queue) > + return -EBUSY; > + > size = pm->pmf->query_status_size; > mutex_lock(&pm->lock); > kq_acquire_packet_buffer(pm->priv_queue, _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx