Am 2021-08-03 um 2:57 p.m. schrieb Eric Huang: > It is to differenciate case scenario for proper behavior when > calling evict queues, such as GPU reset doesn't need to roll > back restoring partial evicted queues. > > Signed-off-by: Eric Huang <jinhuieric.huang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 10 +++++----- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 18 ++++++++++-------- > 5 files changed, 20 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > index 77044e8ba4e6..59ce5a17a834 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > @@ -190,7 +190,7 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev, > void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm) > { > if (adev->kfd.dev) > - kgd2kfd_suspend(adev->kfd.dev, run_pm); > + kgd2kfd_suspend(adev->kfd.dev, run_pm, false); If suspend fails, this should return an error that is then handled in amdgpu_device_suspend. Maybe this could be fixed in a follow-up patch. This means kgd2kfd_suspend and kfd_suspend_all_processes should no longer return void; they should return an error code on failure, at least when force=false. 
Otherwise this patch is Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> > } > > int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm, bool sync) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index 332ccba00e69..b7e46ad0507e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -372,7 +372,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, > struct drm_device *ddev, > const struct kgd2kfd_shared_resources *gpu_resources); > void kgd2kfd_device_exit(struct kfd_dev *kfd); > -void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm); > +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force); > int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm, bool sync); > int kgd2kfd_pre_reset(struct kfd_dev *kfd); > int kgd2kfd_post_reset(struct kfd_dev *kfd); > @@ -407,7 +407,7 @@ static inline void kgd2kfd_device_exit(struct kfd_dev *kfd) > { > } > > -static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) > +static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force) > { > } > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index 24b5e0aa1eac..48e51ee8de56 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -940,7 +940,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, > void kgd2kfd_device_exit(struct kfd_dev *kfd) > { > if (kfd->init_complete) { > - kgd2kfd_suspend(kfd, false); > + kgd2kfd_suspend(kfd, false, true); > svm_migrate_fini((struct amdgpu_device *)kfd->kgd); > device_queue_manager_uninit(kfd->dqm); > kfd_interrupt_exit(kfd); > @@ -965,7 +965,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) > > kfd->dqm->ops.pre_reset(kfd->dqm); > > - kgd2kfd_suspend(kfd, false); > + kgd2kfd_suspend(kfd, false, true); > > kfd_signal_reset_event(kfd); > return 0; > @@ -1001,7 +1001,7 @@ bool 
kfd_is_locked(void) > return (atomic_read(&kfd_locked) > 0); > } > > -void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) > +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force) > { > if (!kfd->init_complete) > return; > @@ -1010,7 +1010,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) > if (!run_pm) { > /* For first KFD device suspend all the KFD processes */ > if (atomic_inc_return(&kfd_locked) == 1) > - kfd_suspend_all_processes(); > + kfd_suspend_all_processes(force); > } > > kfd->dqm->ops.stop(kfd->dqm); > @@ -1122,7 +1122,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm) > return -ESRCH; > > WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); > - r = kfd_process_evict_queues(p); > + r = kfd_process_evict_queues(p, true); > > kfd_unref_process(p); > return r; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 3d5d3994d8a4..e80fb64a6dcc 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -1042,9 +1042,9 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx( > } > > void kfd_unref_process(struct kfd_process *p); > -int kfd_process_evict_queues(struct kfd_process *p); > +int kfd_process_evict_queues(struct kfd_process *p, bool force); > int kfd_process_restore_queues(struct kfd_process *p); > -void kfd_suspend_all_processes(void); > +void kfd_suspend_all_processes(bool force); > /* > * kfd_resume_all_processes: > * bool sync: If kfd_resume_all_processes() should wait for the > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 38a9dee40785..a41ece37bc3c 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1875,20 +1875,22 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) > * Eviction is reference-counted per process-device. 
This means multiple > * evictions from different sources can be nested safely. > */ > -int kfd_process_evict_queues(struct kfd_process *p) > +int kfd_process_evict_queues(struct kfd_process *p, bool force) > { > - int r = 0; > + int r = 0, r_tmp = 0; > int i; > unsigned int n_evicted = 0; > > for (i = 0; i < p->n_pdds; i++) { > struct kfd_process_device *pdd = p->pdds[i]; > > - r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, > + r_tmp = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, > &pdd->qpd); > - if (r) { > + if (r_tmp) { > pr_err("Failed to evict process queues\n"); > - goto fail; > + r = r_tmp; > + if (!force) > + goto fail; > } > n_evicted++; > } > @@ -2079,7 +2081,7 @@ static void evict_process_worker(struct work_struct *work) > p->last_evict_timestamp = get_jiffies_64(); > > pr_debug("Started evicting pasid 0x%x\n", p->pasid); > - ret = kfd_process_evict_queues(p); > + ret = kfd_process_evict_queues(p, false); > if (!ret) { > dma_fence_signal(p->ef); > dma_fence_put(p->ef); > @@ -2147,7 +2149,7 @@ static void restore_process_worker(struct work_struct *work) > pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid); > } > > -void kfd_suspend_all_processes(void) > +void kfd_suspend_all_processes(bool force) > { > struct kfd_process *p; > unsigned int temp; > @@ -2158,7 +2160,7 @@ void kfd_suspend_all_processes(void) > cancel_delayed_work_sync(&p->eviction_work); > cancel_delayed_work_sync(&p->restore_work); > > - if (kfd_process_evict_queues(p)) > + if (kfd_process_evict_queues(p, force)) > pr_err("Failed to suspend process 0x%x\n", p->pasid); > dma_fence_signal(p->ef); > dma_fence_put(p->ef);