On Fri, 11 May 2018 20:06:14 +0100 Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> wrote: > Add two new ioctls for VFIO containers. VFIO_IOMMU_BIND_PROCESS creates a > bond between a container and a process address space, identified by a > Process Address Space ID (PASID). Devices in the container append this > PASID to DMA transactions in order to access the process' address space. > The process page tables are shared with the IOMMU, and mechanisms such as > PCI ATS/PRI are used to handle faults. VFIO_IOMMU_UNBIND_PROCESS removes a > bond created with VFIO_IOMMU_BIND_PROCESS. > > Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@xxxxxxx> A couple of small comments inline.. Jonathan > > --- > v1->v2: > * Simplify mm_exit > * Can be built as module again > --- > drivers/vfio/vfio_iommu_type1.c | 449 ++++++++++++++++++++++++++++++-- > include/uapi/linux/vfio.h | 76 ++++++ > 2 files changed, 504 insertions(+), 21 deletions(-) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index 5c212bf29640..2902774062b8 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -30,6 +30,7 @@ > #include <linux/iommu.h> > #include <linux/module.h> > #include <linux/mm.h> > +#include <linux/ptrace.h> > #include <linux/rbtree.h> > #include <linux/sched/signal.h> > #include <linux/sched/mm.h> > @@ -60,6 +61,7 @@ MODULE_PARM_DESC(disable_hugepages, > > struct vfio_iommu { > struct list_head domain_list; > + struct list_head mm_list; > struct vfio_domain *external_domain; /* domain for external user */ > struct mutex lock; > struct rb_root dma_list; > @@ -90,6 +92,14 @@ struct vfio_dma { > struct vfio_group { > struct iommu_group *iommu_group; > struct list_head next; > + bool sva_enabled; > +}; > + > +struct vfio_mm { > +#define VFIO_PASID_INVALID (-1) > + int pasid; > + struct mm_struct *mm; > + struct list_head next; > }; > > /* > @@ -1238,6 +1248,164 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, > return 0; > } > > +static int vfio_iommu_mm_exit(struct device *dev, int pasid, void *data) > +{ > + struct vfio_mm *vfio_mm; > + struct vfio_iommu *iommu = data; > + > + mutex_lock(&iommu->lock); > + list_for_each_entry(vfio_mm, &iommu->mm_list, next) { > + if (vfio_mm->pasid == pasid) { > + list_del(&vfio_mm->next); > + kfree(vfio_mm); > + break; > + } > + } > + mutex_unlock(&iommu->lock); > + > + return 0; > +} > + > +static int vfio_iommu_sva_init(struct device *dev, void *data) > +{ > + return iommu_sva_device_init(dev, IOMMU_SVA_FEAT_IOPF, 0, > + vfio_iommu_mm_exit); > +} > + > +static int vfio_iommu_sva_shutdown(struct device *dev, void *data) > +{ > + iommu_sva_device_shutdown(dev); > + > + return 0; > +} > + > +struct vfio_iommu_sva_bind_data { > + struct vfio_mm *vfio_mm; > + struct vfio_iommu *iommu; > + int count; > +}; > + > +static int vfio_iommu_sva_bind_dev(struct device *dev, void *data) > +{ > + struct vfio_iommu_sva_bind_data *bind_data = data; > + > + /* Multi-device groups aren't support for SVA */ > + if (bind_data->count++) > + return -EINVAL; > + > + return __iommu_sva_bind_device(dev, bind_data->vfio_mm->mm, > + &bind_data->vfio_mm->pasid, > + IOMMU_SVA_FEAT_IOPF, bind_data->iommu); > +} > + > +static int vfio_iommu_sva_unbind_dev(struct device *dev, void *data) > +{ > + struct vfio_mm *vfio_mm = data; > + > + return __iommu_sva_unbind_device(dev, vfio_mm->pasid); > +} > + > +static int vfio_iommu_bind_group(struct vfio_iommu *iommu, > + struct vfio_group *group, > + struct vfio_mm *vfio_mm) > +{ > + int ret; > + bool enabled_sva = false; > + struct vfio_iommu_sva_bind_data data = { > + .vfio_mm = vfio_mm, > + .iommu = iommu, > + .count = 0, > + }; > + > + if (!group->sva_enabled) { > + ret = iommu_group_for_each_dev(group->iommu_group, NULL, > + vfio_iommu_sva_init); > + if (ret) > + return ret; > + > + group->sva_enabled = enabled_sva = true; > + } > + > + ret = iommu_group_for_each_dev(group->iommu_group, &data, > + vfio_iommu_sva_bind_dev); > + if (ret && data.count > 1) Are we safe to run this even if data.count == 1? I assume that at that point we would always not have an iommu domain associated with the device so the initial check would error out. Just be nice to get rid of the special casing in this error path as then could just do it all under if (ret) and make it visually clearer these are different aspects of the same error path. > + iommu_group_for_each_dev(group->iommu_group, vfio_mm, > + vfio_iommu_sva_unbind_dev); > + if (ret && enabled_sva) { > + iommu_group_for_each_dev(group->iommu_group, NULL, > + vfio_iommu_sva_shutdown); > + group->sva_enabled = false; > + } > + > + return ret; > +} > + > +static void vfio_iommu_unbind_group(struct vfio_group *group, > + struct vfio_mm *vfio_mm) > +{ > + iommu_group_for_each_dev(group->iommu_group, vfio_mm, > + vfio_iommu_sva_unbind_dev); > +} > + > +static void vfio_iommu_unbind(struct vfio_iommu *iommu, > + struct vfio_mm *vfio_mm) > +{ > + struct vfio_group *group; > + struct vfio_domain *domain; > + > + list_for_each_entry(domain, &iommu->domain_list, next) > + list_for_each_entry(group, &domain->group_list, next) > + vfio_iommu_unbind_group(group, vfio_mm); > +} > + > +static int vfio_iommu_replay_bind(struct vfio_iommu *iommu, > + struct vfio_group *group) > +{ > + int ret = 0; > + struct vfio_mm *vfio_mm; > + > + list_for_each_entry(vfio_mm, &iommu->mm_list, next) { > + /* > + * Ensure that mm doesn't exit while we're binding it to the new > + * group. It may already have exited, and the mm_exit notifier > + * might be waiting on the IOMMU mutex to remove this vfio_mm > + * from the list. > + */ > + if (!mmget_not_zero(vfio_mm->mm)) > + continue; > + ret = vfio_iommu_bind_group(iommu, group, vfio_mm); > + /* > + * Use async to avoid triggering an mm_exit callback right away, > + * which would block on the mutex that we're holding. > + */ > + mmput_async(vfio_mm->mm); > + > + if (ret) > + goto out_unbind; > + } > + > + return 0; > + > +out_unbind: > + list_for_each_entry_continue_reverse(vfio_mm, &iommu->mm_list, next) > + vfio_iommu_unbind_group(group, vfio_mm); > + > + return ret; > +} > + > +static void vfio_iommu_free_all_mm(struct vfio_iommu *iommu) > +{ > + struct vfio_mm *vfio_mm, *next; > + > + /* > + * No need for unbind() here. Since all groups are detached from this > + * iommu, bonds have been removed. > + */ > + list_for_each_entry_safe(vfio_mm, next, &iommu->mm_list, next) > + kfree(vfio_mm); > + INIT_LIST_HEAD(&iommu->mm_list); > +} > + > /* > * We change our unmap behavior slightly depending on whether the IOMMU > * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage > @@ -1313,6 +1481,44 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base) > return ret; > } > > +static int vfio_iommu_try_attach_group(struct vfio_iommu *iommu, > + struct vfio_group *group, > + struct vfio_domain *cur_domain, > + struct vfio_domain *new_domain) > +{ > + struct iommu_group *iommu_group = group->iommu_group; > + > + /* > + * Try to match an existing compatible domain. We don't want to > + * preclude an IOMMU driver supporting multiple bus_types and being > + * able to include different bus_types in the same IOMMU domain, so > + * we test whether the domains use the same iommu_ops rather than > + * testing if they're on the same bus_type. > + */ > + if (new_domain->domain->ops != cur_domain->domain->ops || > + new_domain->prot != cur_domain->prot) > + return 1; > + > + iommu_detach_group(cur_domain->domain, iommu_group); > + > + if (iommu_attach_group(new_domain->domain, iommu_group)) > + goto out_reattach; > + > + if (vfio_iommu_replay_bind(iommu, group)) > + goto out_detach; > + > + return 0; > + > +out_detach: > + iommu_detach_group(new_domain->domain, iommu_group); > + > +out_reattach: > + if (iommu_attach_group(cur_domain->domain, iommu_group)) > + return -EINVAL; > + > + return 1; > +} > + > static int vfio_iommu_type1_attach_group(void *iommu_data, > struct iommu_group *iommu_group) > { > @@ -1410,28 +1616,16 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, > if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) > domain->prot |= IOMMU_CACHE; > > - /* > - * Try to match an existing compatible domain. We don't want to > - * preclude an IOMMU driver supporting multiple bus_types and being > - * able to include different bus_types in the same IOMMU domain, so > - * we test whether the domains use the same iommu_ops rather than > - * testing if they're on the same bus_type. > - */ > list_for_each_entry(d, &iommu->domain_list, next) { > - if (d->domain->ops == domain->domain->ops && > - d->prot == domain->prot) { > - iommu_detach_group(domain->domain, iommu_group); > - if (!iommu_attach_group(d->domain, iommu_group)) { > - list_add(&group->next, &d->group_list); > - iommu_domain_free(domain->domain); > - kfree(domain); > - mutex_unlock(&iommu->lock); > - return 0; > - } > - > - ret = iommu_attach_group(domain->domain, iommu_group); > - if (ret) > - goto out_domain; > + ret = vfio_iommu_try_attach_group(iommu, group, domain, d); > + if (ret < 0) { > + goto out_domain; > + } else if (!ret) { > + list_add(&group->next, &d->group_list); > + iommu_domain_free(domain->domain); > + kfree(domain); > + mutex_unlock(&iommu->lock); > + return 0; > } > } > > @@ -1442,6 +1636,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, > if (ret) > goto out_detach; > > + ret = vfio_iommu_replay_bind(iommu, group); > + if (ret) > + goto out_detach; > + > if (resv_msi) { > ret = iommu_get_msi_cookie(domain->domain, resv_msi_base); > if (ret) > @@ -1547,6 +1745,11 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, > continue; > > iommu_detach_group(domain->domain, iommu_group); > + if (group->sva_enabled) { > + iommu_group_for_each_dev(iommu_group, NULL, > + vfio_iommu_sva_shutdown); > + group->sva_enabled = false; > + } > list_del(&group->next); > kfree(group); > /* > @@ -1562,6 +1765,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, > vfio_iommu_unmap_unpin_all(iommu); > else > vfio_iommu_unmap_unpin_reaccount(iommu); > + vfio_iommu_free_all_mm(iommu); > } > iommu_domain_free(domain->domain); > list_del(&domain->next); > @@ -1596,6 +1800,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) > } > > INIT_LIST_HEAD(&iommu->domain_list); > + INIT_LIST_HEAD(&iommu->mm_list); > iommu->dma_list = RB_ROOT; > mutex_init(&iommu->lock); > BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); > @@ -1630,6 +1835,7 @@ static void vfio_iommu_type1_release(void *iommu_data) > kfree(iommu->external_domain); > } > > + vfio_iommu_free_all_mm(iommu); > vfio_iommu_unmap_unpin_all(iommu); > > list_for_each_entry_safe(domain, domain_tmp, > @@ -1658,6 +1864,169 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) > return ret; > } > > +static struct mm_struct *vfio_iommu_get_mm_by_vpid(pid_t vpid) > +{ > + struct mm_struct *mm; > + struct task_struct *task; > + > + task = find_get_task_by_vpid(vpid); > + if (!task) > + return ERR_PTR(-ESRCH); > + > + /* Ensure that current has RW access on the mm */ > + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); > + put_task_struct(task); > + > + if (!mm) > + return ERR_PTR(-ESRCH); > + > + return mm; > +} > + > +static long vfio_iommu_type1_bind_process(struct vfio_iommu *iommu, > + void __user *arg, > + struct vfio_iommu_type1_bind *bind) > +{ > + struct vfio_iommu_type1_bind_process params; > + struct vfio_domain *domain; > + struct vfio_group *group; > + struct vfio_mm *vfio_mm; > + struct mm_struct *mm; > + unsigned long minsz; > + int ret = 0; > + > + minsz = sizeof(*bind) + sizeof(params); > + if (bind->argsz < minsz) > + return -EINVAL; > + > + arg += sizeof(*bind); > + if (copy_from_user(¶ms, arg, sizeof(params))) > + return -EFAULT; > + > + if (params.flags & ~VFIO_IOMMU_BIND_PID) > + return -EINVAL; > + > + if (params.flags & VFIO_IOMMU_BIND_PID) { > + mm = vfio_iommu_get_mm_by_vpid(params.pid); > + if (IS_ERR(mm)) > + return PTR_ERR(mm); > + } else { > + mm = get_task_mm(current); > + if (!mm) > + return -EINVAL; > + } > + > + mutex_lock(&iommu->lock); > + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { > + ret = -EINVAL; > + goto out_unlock; > + } > + > + list_for_each_entry(vfio_mm, &iommu->mm_list, next) { > + if (vfio_mm->mm == mm) { > + params.pasid = vfio_mm->pasid; > + ret = copy_to_user(arg, ¶ms, sizeof(params)) ? > + -EFAULT : 0; > + goto out_unlock; > + } > + } > + > + vfio_mm = kzalloc(sizeof(*vfio_mm), GFP_KERNEL); > + if (!vfio_mm) { > + ret = -ENOMEM; > + goto out_unlock; > + } > + > + vfio_mm->mm = mm; > + vfio_mm->pasid = VFIO_PASID_INVALID; > + > + list_for_each_entry(domain, &iommu->domain_list, next) { > + list_for_each_entry(group, &domain->group_list, next) { > + ret = vfio_iommu_bind_group(iommu, group, vfio_mm); > + if (ret) > + goto out_unbind; > + } > + } > + > + list_add(&vfio_mm->next, &iommu->mm_list); > + > + params.pasid = vfio_mm->pasid; > + ret = copy_to_user(arg, ¶ms, sizeof(params)) ? -EFAULT : 0; > + if (ret) > + goto out_delete; > + > + mutex_unlock(&iommu->lock); > + mmput(mm); > + return 0; > + > +out_delete: > + list_del(&vfio_mm->next); > + > +out_unbind: > + /* Undo all binds that already succeeded */ > + vfio_iommu_unbind(iommu, vfio_mm); > + kfree(vfio_mm); > + > +out_unlock: > + mutex_unlock(&iommu->lock); > + mmput(mm); > + return ret; > +} > + > +static long vfio_iommu_type1_unbind_process(struct vfio_iommu *iommu, > + void __user *arg, > + struct vfio_iommu_type1_bind *bind) > +{ > + int ret = -EINVAL; > + unsigned long minsz; > + struct mm_struct *mm; > + struct vfio_mm *vfio_mm; > + struct vfio_iommu_type1_bind_process params; > + > + minsz = sizeof(*bind) + sizeof(params); > + if (bind->argsz < minsz) > + return -EINVAL; > + > + arg += sizeof(*bind); > + if (copy_from_user(¶ms, arg, sizeof(params))) > + return -EFAULT; > + > + if (params.flags & ~VFIO_IOMMU_BIND_PID) > + return -EINVAL; > + > + /* > + * We can't simply call unbind with the PASID, because the process might > + * have died and the PASID might have been reallocated to another > + * process. Instead we need to fetch that process mm by PID again to > + * make sure we remove the right vfio_mm. > + */ > + if (params.flags & VFIO_IOMMU_BIND_PID) { > + mm = vfio_iommu_get_mm_by_vpid(params.pid); > + if (IS_ERR(mm)) > + return PTR_ERR(mm); > + } else { > + mm = get_task_mm(current); > + if (!mm) > + return -EINVAL; > + } > + > + ret = -ESRCH; > + mutex_lock(&iommu->lock); > + list_for_each_entry(vfio_mm, &iommu->mm_list, next) { > + if (vfio_mm->mm == mm) { > + vfio_iommu_unbind(iommu, vfio_mm); > + list_del(&vfio_mm->next); > + kfree(vfio_mm); > + ret = 0; > + break; > + } > + } > + mutex_unlock(&iommu->lock); > + mmput(mm); > + > + return ret; > +} > + > static long vfio_iommu_type1_ioctl(void *iommu_data, > unsigned int cmd, unsigned long arg) > { > @@ -1728,6 +2097,44 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, > > return copy_to_user((void __user *)arg, &unmap, minsz) ? > -EFAULT : 0; > + > + } else if (cmd == VFIO_IOMMU_BIND) { > + struct vfio_iommu_type1_bind bind; > + > + minsz = offsetofend(struct vfio_iommu_type1_bind, flags); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz) > + return -EINVAL; > + > + switch (bind.flags) { > + case VFIO_IOMMU_BIND_PROCESS: > + return vfio_iommu_type1_bind_process(iommu, (void *)arg, > + &bind); Can we combine these two blocks given it is only this case statement that is different? > + default: > + return -EINVAL; > + } > + > + } else if (cmd == VFIO_IOMMU_UNBIND) { > + struct vfio_iommu_type1_bind bind; > + > + minsz = offsetofend(struct vfio_iommu_type1_bind, flags); > + > + if (copy_from_user(&bind, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (bind.argsz < minsz) > + return -EINVAL; > + > + switch (bind.flags) { > + case VFIO_IOMMU_BIND_PROCESS: > + return vfio_iommu_type1_unbind_process(iommu, (void *)arg, > + &bind); > + default: > + return -EINVAL; > + } > } > > return -ENOTTY; > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 1aa7b82e8169..dc07752c8fe8 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -665,6 +665,82 @@ struct vfio_iommu_type1_dma_unmap { > #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) > #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) > > +/* > + * VFIO_IOMMU_BIND_PROCESS > + * > + * Allocate a PASID for a process address space, and use it to attach this > + * process to all devices in the container. Devices can then tag their DMA > + * traffic with the returned @pasid to perform transactions on the associated > + * virtual address space. Mapping and unmapping buffers is performed by standard > + * functions such as mmap and malloc. > + * > + * If flag is VFIO_IOMMU_BIND_PID, @pid contains the pid of a foreign process to > + * bind. Otherwise the current task is bound. Given that the caller owns the > + * device, setting this flag grants the caller read and write permissions on the > + * entire address space of foreign process described by @pid. Therefore, > + * permission to perform the bind operation on a foreign process is governed by > + * the ptrace access mode PTRACE_MODE_ATTACH_REALCREDS check. See man ptrace(2) > + * for more information. > + * > + * On success, VFIO writes a Process Address Space ID (PASID) into @pasid. This > + * ID is unique to a process and can be used on all devices in the container. > + * > + * On fork, the child inherits the device fd and can use the bonds setup by its > + * parent. Consequently, the child has R/W access on the address spaces bound by > + * its parent. After an execv, the device fd is closed and the child doesn't > + * have access to the address space anymore. > + * > + * To remove a bond between process and container, VFIO_IOMMU_UNBIND ioctl is > + * issued with the same parameters. If a pid was specified in VFIO_IOMMU_BIND, > + * it should also be present for VFIO_IOMMU_UNBIND. Otherwise unbind the current > + * task from the container. > + */ > +struct vfio_iommu_type1_bind_process { > + __u32 flags; > +#define VFIO_IOMMU_BIND_PID (1 << 0) > + __u32 pasid; > + __s32 pid; > +}; > + > +/* > + * Only mode supported at the moment is VFIO_IOMMU_BIND_PROCESS, which takes > + * vfio_iommu_type1_bind_process in data. > + */ > +struct vfio_iommu_type1_bind { > + __u32 argsz; > + __u32 flags; > +#define VFIO_IOMMU_BIND_PROCESS (1 << 0) > + __u8 data[]; > +}; > + > +/* > + * VFIO_IOMMU_BIND - _IOWR(VFIO_TYPE, VFIO_BASE + 22, struct vfio_iommu_bind) > + * > + * Manage address spaces of devices in this container. Initially a TYPE1 > + * container can only have one address space, managed with > + * VFIO_IOMMU_MAP/UNMAP_DMA. > + * > + * An IOMMU of type VFIO_TYPE1_NESTING_IOMMU can be managed by both MAP/UNMAP > + * and BIND ioctls at the same time. MAP/UNMAP acts on the stage-2 (host) page > + * tables, and BIND manages the stage-1 (guest) page tables. Other types of > + * IOMMU may allow MAP/UNMAP and BIND to coexist, where MAP/UNMAP controls > + * non-PASID traffic and BIND controls PASID traffic. But this depends on the > + * underlying IOMMU architecture and isn't guaranteed. > + * > + * Availability of this feature depends on the device, its bus, the underlying > + * IOMMU and the CPU architecture. > + * > + * returns: 0 on success, -errno on failure. > + */ > +#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 22) > + > +/* > + * VFIO_IOMMU_UNBIND - _IOWR(VFIO_TYPE, VFIO_BASE + 23, struct vfio_iommu_bind) > + * > + * Undo what was done by the corresponding VFIO_IOMMU_BIND ioctl. > + */ > +#define VFIO_IOMMU_UNBIND _IO(VFIO_TYPE, VFIO_BASE + 23) > + > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ > > /* -- To unsubscribe from this list: send the line "unsubscribe devicetree" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html