On Wed, Nov 3, 2021 at 11:05 AM shaoyunl <shaoyun.liu@xxxxxxx> wrote: > > When KFD needs to be reset, sending commands to the HWS might cause a hang and an unnecessary timeout. > This change tries not to touch the HW in pre_reset and keeps the queues in the evicted state > when the reset is done, so they are not put back on the runlist. These queues will be destroyed > on process termination. > > Signed-off-by: shaoyunl <shaoyun.liu@xxxxxxx> > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 6 +++++- > drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++++- > 4 files changed, 13 insertions(+), 3 deletions(-) > mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_device.c > mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_priv.h > mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_process.c Please fix the mode change. Alex > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > old mode 100644 > new mode 100755 > index c8aade17efef..536ef766d09e > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -1100,6 +1100,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) > if (!kfd->init_complete) > return 0; > > + kfd->is_resetting = true; > + > kfd_smi_event_update_gpu_reset(kfd, false); > > kfd->dqm->ops.pre_reset(kfd->dqm); > @@ -1132,6 +1134,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) > > kfd_smi_event_update_gpu_reset(kfd, true); > > + kfd->is_resetting = false; > + > return 0; > } > > @@ -1168,7 +1172,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) > return ret; > > /* for runtime resume, skip unlocking kfd */ > - if (!run_pm) { > + if (!run_pm && !kfd->is_resetting) { > count = atomic_dec_return(&kfd_locked); > WARN_ONCE(count < 0, "KFD suspend / resume ref. 
error"); > if (count == 0) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > old mode 100644 > new mode 100755 > index e9601d4dfb77..0a60317509c8 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1430,7 +1430,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, > > if (!dqm->sched_running) > return 0; > - if (dqm->is_hws_hang) > + if (dqm->is_hws_hang || dqm->is_resetting) > return -EIO; > if (!dqm->active_runlist) > return retval; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > old mode 100644 > new mode 100755 > index bfe7bacccb73..e4bcc2a09ca8 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -275,6 +275,8 @@ struct kfd_dev { > struct device_queue_manager *dqm; > > bool init_complete; > + bool is_resetting; > + > /* > * Interrupts of interest to KFD are copied > * from the HW ring into a SW ring. > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > old mode 100644 > new mode 100755 > index f8a8fdb95832..f29b3932e3dc > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1715,7 +1715,11 @@ int kfd_process_evict_queues(struct kfd_process *p) > > r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, > &pdd->qpd); > - if (r) { > + /* evict returns -EIO if the HWS is hung or the ASIC is resetting; in this case > + * we would like to set all the queues to the evicted state to prevent > + * them from being added back, since they are not actually saved right now. > + */ > + if (r && r != -EIO) { > pr_err("Failed to evict process queues\n"); > goto fail; > } > -- > 2.17.1 >