On Fri, 11 May 2018 20:06:06 +0100 Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> wrote: > When creating an io_mm structure, register an MMU notifier that informs > us when the virtual address space changes and disappears. > > Add a new operation to the IOMMU driver, mm_invalidate, called when a > range of addresses is unmapped to let the IOMMU driver send ATC > invalidations. mm_invalidate cannot sleep. > > Adding the notifier complicates io_mm release. In one case device > drivers free the io_mm explicitly by calling unbind (or detaching the > device from its domain). In the other case the process could crash > before unbind, in which case the release notifier has to do all the > work. > > Allowing the device driver's mm_exit() handler to sleep adds another > complication, but it will greatly simplify things for users. For example > VFIO can take the IOMMU mutex and remove any trace of io_mm, instead of > introducing complex synchronization to delicatly handle this race. But > relaxing the user side does force unbind() to sleep and wait for all > pending mm_exit() calls to finish. > > Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> > > --- > v1->v2: > * Unbind() waits for mm_exit to finish > * mm_exit can sleep > --- > drivers/iommu/Kconfig | 1 + > drivers/iommu/iommu-sva.c | 248 +++++++++++++++++++++++++++++++++++--- > include/linux/iommu.h | 10 ++ > 3 files changed, 244 insertions(+), 15 deletions(-) > > diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig > index cca8e06903c7..38434899e283 100644 > --- a/drivers/iommu/Kconfig > +++ b/drivers/iommu/Kconfig > @@ -77,6 +77,7 @@ config IOMMU_DMA > config IOMMU_SVA > bool > select IOMMU_API > + select MMU_NOTIFIER > > config FSL_PAMU > bool "Freescale IOMMU support" > diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c > index 0700893c679d..e9afae2537a2 100644 > --- a/drivers/iommu/iommu-sva.c > +++ b/drivers/iommu/iommu-sva.c > @@ -7,6 +7,7 @@ > > #include <linux/idr.h> > #include <linux/iommu.h> > +#include <linux/mmu_notifier.h> > #include <linux/sched/mm.h> > #include <linux/slab.h> > #include <linux/spinlock.h> > @@ -106,6 +107,9 @@ struct iommu_bond { > struct list_head mm_head; > struct list_head dev_head; > struct list_head domain_head; > + refcount_t refs; > + struct wait_queue_head mm_exit_wq; > + bool mm_exit_active; > > void *drvdata; > }; > @@ -124,6 +128,8 @@ static DEFINE_IDR(iommu_pasid_idr); > */ > static DEFINE_SPINLOCK(iommu_sva_lock); > > +static struct mmu_notifier_ops iommu_mmu_notifier; > + > static struct io_mm * > io_mm_alloc(struct iommu_domain *domain, struct device *dev, > struct mm_struct *mm, unsigned long flags) > @@ -151,6 +157,7 @@ io_mm_alloc(struct iommu_domain *domain, struct device *dev, > > io_mm->flags = flags; > io_mm->mm = mm; > + io_mm->notifier.ops = &iommu_mmu_notifier; > io_mm->release = domain->ops->mm_free; > INIT_LIST_HEAD(&io_mm->devices); > > @@ -167,8 +174,29 @@ io_mm_alloc(struct iommu_domain *domain, struct device *dev, > goto err_free_mm; > } > > - /* TODO: keep track of mm. For the moment, abort. */ > - ret = -ENOSYS; > + ret = mmu_notifier_register(&io_mm->notifier, mm); > + if (ret) > + goto err_free_pasid; > + > + /* > + * Now that the MMU notifier is valid, we can allow users to grab this > + * io_mm by setting a valid refcount. Before that it was accessible in > + * the IDR but invalid. 
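
Not a problem with the patch, just checking my reading of the publication scheme here: the fields (including the notifier and the IDR entry) are made visible before the refcount becomes non-zero, and lookups must not dereference the io_mm until they have taken a reference. Condensing this hunk together with io_mm_get_locked() further down, the pairing is roughly:

	/* io_mm_alloc(): initialise everything, then publish */
	io_mm->mm = mm;
	io_mm->notifier.ops = &iommu_mmu_notifier;
	/* ... remaining fields, IDR insertion, mmu_notifier_register() ... */
	smp_wmb();			/* order the stores above... */
	kref_init(&io_mm->kref);	/* ...before the refcount becomes 1 */

	/* io_mm_get_locked(): only dereference after a successful get */
	if (io_mm && kref_get_unless_zero(&io_mm->kref)) {
		smp_rmb();		/* pairs with the smp_wmb() above */
		/* io_mm->mm, io_mm->pasid, ... are now safe to read */
	}

(Only restating the patch for my own benefit; the ordering itself looks right to me.)
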
> + * > + * The following barrier ensures that users, who obtain the io_mm with > + * kref_get_unless_zero, don't read uninitialized fields in the > + * structure. > + */ > + smp_wmb(); > + kref_init(&io_mm->kref); > + > + return io_mm; > + > +err_free_pasid: > + /* > + * Even if the io_mm is accessible from the IDR at this point, kref is > + * 0 so no user could get a reference to it. Free it manually. > + */ > spin_lock(&iommu_sva_lock); > idr_remove(&iommu_pasid_idr, io_mm->pasid); > spin_unlock(&iommu_sva_lock); > @@ -180,9 +208,13 @@ io_mm_alloc(struct iommu_domain *domain, struct device *dev, > return ERR_PTR(ret); > } > > -static void io_mm_free(struct io_mm *io_mm) > +static void io_mm_free(struct rcu_head *rcu) > { > - struct mm_struct *mm = io_mm->mm; > + struct io_mm *io_mm; > + struct mm_struct *mm; > + > + io_mm = container_of(rcu, struct io_mm, rcu); > + mm = io_mm->mm; > > io_mm->release(io_mm); > mmdrop(mm); > @@ -197,7 +229,22 @@ static void io_mm_release(struct kref *kref) > > idr_remove(&iommu_pasid_idr, io_mm->pasid); > > - io_mm_free(io_mm); > + /* > + * If we're being released from mm exit, the notifier callback ->release > + * has already been called. Otherwise we don't need ->release, the io_mm > + * isn't attached to anything anymore. Hence no_release. > + */ > + mmu_notifier_unregister_no_release(&io_mm->notifier, io_mm->mm); > + > + /* > + * We can't free the structure here, because if mm exits during > + * unbind(), then ->release might be attempting to grab the io_mm > + * concurrently. And in the other case, if ->release is calling > + * io_mm_release, then __mmu_notifier_release expects to still have a > + * valid mn when returning. So free the structure when it's safe, after > + * the RCU grace period elapsed. > + */ > + mmu_notifier_call_srcu(&io_mm->rcu, io_mm_free); > } > > /* > @@ -206,8 +253,14 @@ static void io_mm_release(struct kref *kref) > */ > static int io_mm_get_locked(struct io_mm *io_mm) > { > - if (io_mm) > - return kref_get_unless_zero(&io_mm->kref); > + if (io_mm && kref_get_unless_zero(&io_mm->kref)) { > + /* > + * kref_get_unless_zero doesn't provide ordering for reads. This > + * barrier pairs with the one in io_mm_alloc. > + */ > + smp_rmb(); > + return 1; > + } > > return 0; > } > @@ -233,7 +286,8 @@ static int io_mm_attach(struct iommu_domain *domain, struct device *dev, > struct iommu_bond *bond, *tmp; > struct iommu_sva_param *param = dev->iommu_param->sva_param; > > - if (!domain->ops->mm_attach || !domain->ops->mm_detach) > + if (!domain->ops->mm_attach || !domain->ops->mm_detach || > + !domain->ops->mm_invalidate) > return -ENODEV; > > if (pasid > param->max_pasid || pasid < param->min_pasid) > @@ -247,6 +301,8 @@ static int io_mm_attach(struct iommu_domain *domain, struct device *dev, > bond->io_mm = io_mm; > bond->dev = dev; > bond->drvdata = drvdata; > + refcount_set(&bond->refs, 1); > + init_waitqueue_head(&bond->mm_exit_wq); > > spin_lock(&iommu_sva_lock); > /* > @@ -275,12 +331,37 @@ static int io_mm_attach(struct iommu_domain *domain, struct device *dev, > return 0; > } > > -static void io_mm_detach_locked(struct iommu_bond *bond) > +static void io_mm_detach_locked(struct iommu_bond *bond, bool wait) > { > struct iommu_bond *tmp; > bool detach_domain = true; > struct iommu_domain *domain = bond->domain; > > + if (wait) { > + bool do_detach = true; > + /* > + * If we're unbind() then we're deleting the bond no matter > + * what. Tell the mm_exit thread that we're cleaning up, and > + * wait until it finishes using the bond. 
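
Breaking the quote here because the unbind() side is where this gets subtle. If I read the wait/refcount dance correctly, the net effect for a driver is that unbind() may now sleep until a concurrent mm_exit() handler for the same bond has returned, so teardown on the driver side can stay very simple. A rough sketch of what I have in mind, with made-up foo_ names and assuming the bind/unbind entry points from earlier in the series:

	/* Hypothetical per-bond context the driver passed as drvdata at bind time */
	struct foo_bond {
		struct device	*dev;
		int		pasid;
	};

	/* Driver teardown path; may sleep, must not be called from atomic context */
	static void foo_teardown_bond(struct foo_bond *fb)
	{
		/*
		 * If the process is exiting concurrently, this waits until
		 * foo_mm_exit() has finished with fb...
		 */
		__iommu_sva_unbind_device(fb->dev, fb->pasid);
		/* ...so freeing it here cannot race with the handler. */
		kfree(fb);
	}

One thing that might deserve a line in the documentation: with this scheme the driver must not call unbind() while holding a lock that its own mm_exit() handler can block on, otherwise unbind() waits for mm_exit(), which waits for the lock.
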
> + * > + * refs is guaranteed to be one or more, otherwise it would > + * already have been removed from the list. Check is someone is

Check if someone...

> + * already waiting, in which case we wait but do not free. > + */ > + if (refcount_read(&bond->refs) > 1) > + do_detach = false; > + > + refcount_inc(&bond->refs); > + wait_event_lock_irq(bond->mm_exit_wq, !bond->mm_exit_active, > + iommu_sva_lock); > + if (!do_detach) > + return; > + > + } else if (!refcount_dec_and_test(&bond->refs)) { > + /* unbind() is waiting to free the bond */ > + return; > + } > + > list_for_each_entry(tmp, &domain->mm_list, domain_head) { > if (tmp->io_mm == bond->io_mm && tmp->dev != bond->dev) { > detach_domain = false; > @@ -298,6 +379,129 @@ static void io_mm_detach_locked(struct iommu_bond *bond) > kfree(bond); > } > > +static int iommu_signal_mm_exit(struct iommu_bond *bond) > +{ > + struct device *dev = bond->dev; > + struct io_mm *io_mm = bond->io_mm; > + struct iommu_sva_param *param = dev->iommu_param->sva_param; > + > + /* > + * We can't hold the device's param_lock. If we did and the device > + * driver used a global lock around io_mm, we would risk getting the > + * following deadlock: > + * > + * exit_mm() | Shutdown SVA > + * mutex_lock(param->lock) | mutex_lock(glob lock) > + * param->mm_exit() | sva_device_shutdown() > + * mutex_lock(glob lock) | mutex_lock(param->lock) > + * > + * Fortunately unbind() waits for us to finish, and sva_device_shutdown > + * requires that any bond is removed, so we can safely access mm_exit > + * and drvdata without taking any lock. > + */ > + if (!param || !param->mm_exit) > + return 0; > + > + return param->mm_exit(dev, io_mm->pasid, bond->drvdata); > +} > + > +/* Called when the mm exits. Can race with unbind(). */ > +static void iommu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) > +{ > + struct iommu_bond *bond, *next; > + struct io_mm *io_mm = container_of(mn, struct io_mm, notifier); > + > + /* > + * If the mm is exiting then devices are still bound to the io_mm. > + * A few things need to be done before it is safe to release: > + * > + * - As the mmu notifier doesn't hold any reference to the io_mm when > + * calling ->release(), try to take a reference. > + * - Tell the device driver to stop using this PASID. > + * - Clear the PASID table and invalidate TLBs. > + * - Drop all references to this io_mm by freeing the bonds. > + */ > + spin_lock(&iommu_sva_lock); > + if (!io_mm_get_locked(io_mm)) { > + /* Someone's already taking care of it. */ > + spin_unlock(&iommu_sva_lock); > + return; > + } > + > + list_for_each_entry_safe(bond, next, &io_mm->devices, mm_head) { > + /* > + * Release the lock to let the handler sleep. We need to be > + * careful about concurrent modifications to the list and to the > + * bond. Tell unbind() not to free the bond until we're done. > + */ > + bond->mm_exit_active = true; > + spin_unlock(&iommu_sva_lock); > + > + if (iommu_signal_mm_exit(bond)) > + dev_WARN(bond->dev, "possible leak of PASID %u", > + io_mm->pasid); > + > + spin_lock(&iommu_sva_lock); > + next = list_next_entry(bond, mm_head); > + > + /* If someone is waiting, let them delete the bond now */ > + bond->mm_exit_active = false; > + wake_up_all(&bond->mm_exit_wq); > + > + /* Otherwise, do it ourselves */ > + io_mm_detach_locked(bond, false); > + } > + spin_unlock(&iommu_sva_lock); > + > + /* > + * We're now reasonably certain that no more fault is being handled for > + * this io_mm, since we just flushed them all out of the fault queue.
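
Given the constraints spelled out in iommu_signal_mm_exit() (no core lock held, allowed to sleep, non-zero return flags a possible PASID leak), the mm_exit() handler I picture on the driver side is something like the sketch below. All foo_ names are invented and the prototype is inferred from the call above, so treat it as illustration only:

	static DEFINE_MUTEX(foo_lock);	/* serialises foo_mm_exit() against the
					 * rest of the hypothetical foo driver */

	static int foo_mm_exit(struct device *dev, int pasid, void *drvdata)
	{
		struct foo_bond *fb = drvdata;

		mutex_lock(&foo_lock);	/* sleeping is allowed here */
		foo_stop_dma(fb);	/* made-up helper: quiesce this PASID */
		mutex_unlock(&foo_lock);

		return 0;		/* device has stopped using the PASID */
	}

Is the intent of a non-zero return that the core should consider the PASID unsafe to reuse? If so, it might be worth spelling that out in the kerneldoc for @mm_exit.
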
> + * Release the last reference to free the io_mm. > + */ > + io_mm_put(io_mm); > +} > + > +static void iommu_notifier_invalidate_range(struct mmu_notifier *mn, > + struct mm_struct *mm, > + unsigned long start, > + unsigned long end) > +{ > + struct iommu_bond *bond; > + struct io_mm *io_mm = container_of(mn, struct io_mm, notifier); > + > + spin_lock(&iommu_sva_lock); > + list_for_each_entry(bond, &io_mm->devices, mm_head) { > + struct iommu_domain *domain = bond->domain; > + > + domain->ops->mm_invalidate(domain, bond->dev, io_mm, start, > + end - start); > + } > + spin_unlock(&iommu_sva_lock); > +} > + > +static int iommu_notifier_clear_flush_young(struct mmu_notifier *mn, > + struct mm_struct *mm, > + unsigned long start, > + unsigned long end) > +{ > + iommu_notifier_invalidate_range(mn, mm, start, end); > + return 0; > +} > + > +static void iommu_notifier_change_pte(struct mmu_notifier *mn, > + struct mm_struct *mm, > + unsigned long address, pte_t pte) > +{ > + iommu_notifier_invalidate_range(mn, mm, address, address + PAGE_SIZE); > +} > + > +static struct mmu_notifier_ops iommu_mmu_notifier = { > + .release = iommu_notifier_release, > + .clear_flush_young = iommu_notifier_clear_flush_young, > + .change_pte = iommu_notifier_change_pte, > + .invalidate_range = iommu_notifier_invalidate_range, > +}; > + > /** > * iommu_sva_device_init() - Initialize Shared Virtual Addressing for a device > * @dev: the device > @@ -320,6 +524,12 @@ static void io_mm_detach_locked(struct iommu_bond *bond) > * The handler gets an opaque pointer corresponding to the drvdata passed as > * argument of bind(). > * > + * The @mm_exit handler is allowed to sleep. Be careful about the locks taken in > + * @mm_exit, because they might lead to deadlocks if they are also held when > + * dropping references to the mm. Consider the following call chain: > + * mutex_lock(A); mmput(mm) -> exit_mm() -> @mm_exit() -> mutex_lock(A) > + * Using mmput_async() prevents this scenario. > + * > * The device should not be performing any DMA while this function is running, > * otherwise the behavior is undefined. > * > @@ -484,15 +694,16 @@ int __iommu_sva_unbind_device(struct device *dev, int pasid) > if (!param || WARN_ON(!domain)) > return -EINVAL; > > - spin_lock(&iommu_sva_lock); > + /* spin_lock_irq matches the one in wait_event_lock_irq */ > + spin_lock_irq(&iommu_sva_lock); > list_for_each_entry(bond, ¶m->mm_list, dev_head) { > if (bond->io_mm->pasid == pasid) { > - io_mm_detach_locked(bond); > + io_mm_detach_locked(bond, true); > ret = 0; > break; > } > } > - spin_unlock(&iommu_sva_lock); > + spin_unlock_irq(&iommu_sva_lock); > > return ret; > } > @@ -503,18 +714,25 @@ EXPORT_SYMBOL_GPL(__iommu_sva_unbind_device); > * @dev: the device > * > * When detaching @device from a domain, IOMMU drivers should use this helper. > + * This function may sleep while waiting for bonds to be released. > */ > void __iommu_sva_unbind_dev_all(struct device *dev) > { > struct iommu_sva_param *param; > struct iommu_bond *bond, *next; > > + /* > + * io_mm_detach_locked might wait, so we shouldn't call it with the dev > + * param lock held. It's fine to read sva_param outside the lock because > + * it can only be freed by iommu_sva_device_shutdown when there are no > + * more bonds in the list. 
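
On the mmput_async() note in the iommu_sva_device_init() comment above: an example might make the pitfall clearer. The sketch below is purely hypothetical (foo_ names are mine) and only illustrates the call chain the comment warns about, assuming mmput_async() is available to the driver:

	struct foo_ctx {
		struct mutex		lock;	/* also taken by foo_mm_exit() */
		struct mm_struct	*mm;	/* reference taken at bind time */
	};

	static void foo_put_mm(struct foo_ctx *foo)
	{
		mutex_lock(&foo->lock);
		/* ... drop foo's bookkeeping for this mm ... */

		/*
		 * If this were mmput() and it dropped the last reference, the
		 * final __mmput() would run right here, reach the ->release()
		 * notifier and call foo_mm_exit(), which would then deadlock
		 * on foo->lock.  mmput_async() defers the final teardown to a
		 * workqueue instead, so the handler runs after foo->lock has
		 * been released.
		 */
		mmput_async(foo->mm);
		mutex_unlock(&foo->lock);
	}
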
> + */ > param = dev->iommu_param->sva_param; > if (param) { > - spin_lock(&iommu_sva_lock); > + spin_lock_irq(&iommu_sva_lock); > list_for_each_entry_safe(bond, next, ¶m->mm_list, dev_head) > - io_mm_detach_locked(bond); > - spin_unlock(&iommu_sva_lock); > + io_mm_detach_locked(bond, true); > + spin_unlock_irq(&iommu_sva_lock); > } > } > EXPORT_SYMBOL_GPL(__iommu_sva_unbind_dev_all); > diff --git a/include/linux/iommu.h b/include/linux/iommu.h > index 439c8fffd836..caa6f79785b9 100644 > --- a/include/linux/iommu.h > +++ b/include/linux/iommu.h > @@ -24,6 +24,7 @@ > #include <linux/types.h> > #include <linux/errno.h> > #include <linux/err.h> > +#include <linux/mmu_notifier.h> > #include <linux/of.h> > #include <uapi/linux/iommu.h> > > @@ -111,10 +112,15 @@ struct io_mm { > unsigned long flags; > struct list_head devices; > struct kref kref; > +#if defined(CONFIG_MMU_NOTIFIER) > + struct mmu_notifier notifier; > +#endif > struct mm_struct *mm; > > /* Release callback for this mm */ > void (*release)(struct io_mm *io_mm); > + /* For postponed release */ > + struct rcu_head rcu; > }; > > enum iommu_cap { > @@ -249,6 +255,7 @@ struct iommu_sva_param { > * @mm_attach: attach io_mm to a device. Install PASID entry if necessary > * @mm_detach: detach io_mm from a device. Remove PASID entry and > * flush associated TLB entries. > + * @mm_invalidate: Invalidate a range of mappings for an mm > * @map: map a physically contiguous memory region to an iommu domain > * @unmap: unmap a physically contiguous memory region from an iommu domain > * @map_sg: map a scatter-gather list of physically contiguous memory chunks > @@ -298,6 +305,9 @@ struct iommu_ops { > struct io_mm *io_mm, bool attach_domain); > void (*mm_detach)(struct iommu_domain *domain, struct device *dev, > struct io_mm *io_mm, bool detach_domain); > + void (*mm_invalidate)(struct iommu_domain *domain, struct device *dev, > + struct io_mm *io_mm, unsigned long vaddr, > + size_t size); > int (*map)(struct iommu_domain *domain, unsigned long iova, > phys_addr_t paddr, size_t size, int prot); > size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
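
Last one from me on this patch: for the new mm_invalidate op, since it is called from the MMU notifier under iommu_sva_lock (and per the commit message cannot sleep), I assume implementations end up as thin non-sleeping wrappers around the driver's ATC invalidation path, roughly along these lines. Entirely made-up foo_ driver, not a suggestion for the SMMUv3 code:

	static void foo_mm_invalidate(struct iommu_domain *domain,
				      struct device *dev, struct io_mm *io_mm,
				      unsigned long vaddr, size_t size)
	{
		struct foo_domain *fd = to_foo_domain(domain);	/* hypothetical */

		/*
		 * Ask the device to drop its ATC entries for this PASID and
		 * range.  Runs under a spinlock: polling for command-queue
		 * space is fine, blocking is not.
		 */
		foo_atc_invalidate(fd, dev, io_mm->pasid, vaddr, size);
	}

Worth keeping in mind that this also gets called via clear_flush_young and change_pte, so it wants to be as cheap as possible.
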