On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling <Felix.Kuehling at amd.com> wrote: > From: Yong Zhao <yong.zhao at amd.com> > > When we do suspend/resume through "sudo pm-suspend" while there is > HSA activity running, upon resume we will encounter HWS hanging, which > is caused by memory read/write failures. The root cause is that when > suspend, we neglected to unbind pasid from kfd device. > > Another major change is that the bind/unbinding is changed to be > performed on a per process basis, instead of whether there are queues > in dqm. > > Signed-off-by: Yong Zhao <yong.zhao at amd.com> > Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com> > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 22 ++++-- > .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 13 ---- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 15 +++- > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 89 ++++++++++++++++++---- > 4 files changed, 101 insertions(+), 38 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index cc8af11..ff3f97c 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -191,7 +191,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) > struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); > > if (dev) > - kfd_unbind_process_from_device(dev, pasid); > + kfd_process_iommu_unbind_callback(dev, pasid); > } > > /* > @@ -339,12 +339,16 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) > > void kgd2kfd_suspend(struct kfd_dev *kfd) > { > - if (kfd->init_complete) { > - kfd->dqm->ops.stop(kfd->dqm); > - amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); > - amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); > - amd_iommu_free_device(kfd->pdev); > - } > + if (!kfd->init_complete) > + return; > + > + kfd->dqm->ops.stop(kfd->dqm); > + > + kfd_unbind_processes_from_device(kfd); > + > + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); > + amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); > + amd_iommu_free_device(kfd->pdev); > } > > int kgd2kfd_resume(struct kfd_dev *kfd) > @@ -369,6 +373,10 @@ static int kfd_resume(struct kfd_dev *kfd) > amd_iommu_set_invalid_ppr_cb(kfd->pdev, > iommu_invalid_ppr_cb); > > + err = kfd_bind_processes_to_device(kfd); > + if (err) > + return -ENXIO; You need to undo previous initialization in case kfd_bind_processes_to_device fails, i.e. call amd_iommu_free_device() > + > err = kfd->dqm->ops.start(kfd->dqm); > if (err) { > dev_err(kfd_device, > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 53a66e8..5db82b8 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm) > > static int start_cpsch(struct device_queue_manager *dqm) > { > - struct device_process_node *node; > int retval; > > retval = 0; > @@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm) > > init_interrupts(dqm); > > - list_for_each_entry(node, &dqm->queues, list) > - if (node->qpd->pqm->process && dqm->dev) > - kfd_bind_process_to_device(dqm->dev, > - node->qpd->pqm->process); > - > execute_queues_cpsch(dqm, true); > > return 0; > @@ -714,15 +708,8 @@ static int start_cpsch(struct device_queue_manager *dqm) > > static int stop_cpsch(struct device_queue_manager *dqm) > { > - struct device_process_node *node; > - struct kfd_process_device *pdd; > - > destroy_queues_cpsch(dqm, true, true); > > - list_for_each_entry(node, &dqm->queues, list) { > - pdd = qpd_to_pdd(node->qpd); > - pdd->bound = false; > - } > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > pm_uninit(&dqm->packets); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index b397ec7..ef582cc 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -435,6 +435,13 @@ struct qcm_process_device { > uint32_t sh_hidden_private_base; > }; > > + > +enum kfd_pdd_bound { > + PDD_UNBOUND = 0, > + PDD_BOUND, > + PDD_BOUND_SUSPENDED, > +}; > + > /* Data that is per-process-per device. */ > struct kfd_process_device { > /* > @@ -459,7 +466,7 @@ struct kfd_process_device { > uint64_t scratch_limit; > > /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ > - bool bound; > + enum kfd_pdd_bound bound; > > /* This flag tells if we should reset all > * wavefronts on process termination > @@ -548,8 +555,10 @@ struct kfd_process *kfd_get_process(const struct task_struct *); > struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); > > struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, > - struct kfd_process *p); > -void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); > + struct kfd_process *p); > +int kfd_bind_processes_to_device(struct kfd_dev *dev); > +void kfd_unbind_processes_from_device(struct kfd_dev *dev); > +void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid); > struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, > struct kfd_process *p); > struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index c74cf22..3ffaac3 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -174,7 +174,10 @@ static void kfd_process_wq_release(struct work_struct *work) > if (pdd->reset_wavefronts) > dbgdev_wave_reset_wavefronts(pdd->dev, p); > > - amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); > + if (pdd->bound == PDD_BOUND) { > + amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); > + pdd->bound = PDD_UNBOUND; I think setting the bound property here is redundant because we do kfree(pdd) in two more statements. > + } > list_del(&pdd->per_device_list); > > kfree(pdd); > @@ -345,9 +348,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, > > list_for_each_entry(pdd, &p->per_device_data, per_device_list) > if (pdd->dev == dev) > - break; > + return pdd; > > - return pdd; > + return NULL; > } > > struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, > @@ -362,6 +365,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, > INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); > pdd->qpd.dqm = dev->dqm; > pdd->reset_wavefronts = false; > + pdd->bound = PDD_UNBOUND; > list_add(&pdd->per_device_list, &p->per_device_data); > } > > @@ -387,19 +391,83 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, > return ERR_PTR(-ENOMEM); > } > > - if (pdd->bound) > + if (pdd->bound == PDD_BOUND) > return pdd; > > + if (pdd->bound == PDD_BOUND_SUSPENDED) { because we check the same variable, we should do it as "else if" > + pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); > + return ERR_PTR(-EINVAL); > + } > + > err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); > if (err < 0) > return ERR_PTR(err); > > - pdd->bound = true; > + pdd->bound = PDD_BOUND; > > return pdd; > } > > -void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) > +int kfd_bind_processes_to_device(struct kfd_dev *dev) > +{ > + struct kfd_process_device *pdd; > + struct kfd_process *p; > + unsigned int temp; > + int err = 0; > + > + int idx = srcu_read_lock(&kfd_processes_srcu); > + > + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { > + mutex_lock(&p->mutex); > + pdd = kfd_get_process_device_data(dev, p); > + if (pdd->bound != PDD_BOUND_SUSPENDED) { > + mutex_unlock(&p->mutex); > + continue; > + } > + > + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, > + p->lead_thread); > + if (err < 0) { > + pr_err("unexpected pasid %d binding failure\n", > + p->pasid); > + mutex_unlock(&p->mutex); > + break; > + } > + > + pdd->bound = PDD_BOUND; > + mutex_unlock(&p->mutex); > + } > + > + srcu_read_unlock(&kfd_processes_srcu, idx); > + > + return err; > +} > + > +void kfd_unbind_processes_from_device(struct kfd_dev *dev) > +{ > + struct kfd_process_device *pdd; > + struct kfd_process *p; > + unsigned int temp, temp_bound, temp_pasid; > + > + int idx = srcu_read_lock(&kfd_processes_srcu); > + > + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { > + mutex_lock(&p->mutex); > + pdd = kfd_get_process_device_data(dev, p); > + temp_bound = pdd->bound; > + temp_pasid = p->pasid; > + if (pdd->bound == PDD_BOUND) > + pdd->bound = PDD_BOUND_SUSPENDED; I would add a comment here explaining why we set pdd->bound to PDD_BOUND_SUSPENDED and not to PDD_UNBOUND. It took me a couple of minutes to traces all the paths in the code in order to understand it. > + mutex_unlock(&p->mutex); > + > + if (temp_bound == PDD_BOUND) > + amd_iommu_unbind_pasid(dev->pdev, temp_pasid); > + } > + > + srcu_read_unlock(&kfd_processes_srcu, idx); > +} > + > +void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) > { > struct kfd_process *p; > struct kfd_process_device *pdd; > @@ -432,15 +500,6 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) > pdd->reset_wavefronts = false; > } > > - /* > - * Just mark pdd as unbound, because we still need it > - * to call amd_iommu_unbind_pasid() in when the > - * process exits. > - * We don't call amd_iommu_unbind_pasid() here > - * because the IOMMU called us. > - */ > - pdd->bound = false; > - > mutex_unlock(&p->mutex); > } > > -- > 2.7.4 > > _______________________________________________ > amd-gfx mailing list > amd-gfx at lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx With the above minor comments fixed, this patch is: Reviewed-by: Oded Gabbay <oded.gabbay at gmail.com>