On Tue, 2014-09-23 at 13:01 +1000, Alexey Kardashevskiy wrote: > This defines and implements VFIO IOMMU API which lets the userspace > create and remove DMA windows. > > This updates VFIO_IOMMU_SPAPR_TCE_GET_INFO to return the number of > available windows and page mask. > > This adds VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE > to allow the user space to create and remove window(s). > > The VFIO IOMMU driver does basic sanity checks and calls corresponding > SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) > implements them. > > This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via > VFIO_IOMMU_SPAPR_TCE_GET_INFO. > > This calls platform DDW reset() callback when IOMMU is being disabled > to reset the DMA configuration to its original state. > > Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> > --- > drivers/vfio/vfio_iommu_spapr_tce.c | 135 ++++++++++++++++++++++++++++++++++-- > include/uapi/linux/vfio.h | 25 ++++++- > 2 files changed, 153 insertions(+), 7 deletions(-) > > diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c > index 0dccbc4..b518891 100644 > --- a/drivers/vfio/vfio_iommu_spapr_tce.c > +++ b/drivers/vfio/vfio_iommu_spapr_tce.c > @@ -190,18 +190,25 @@ static void tce_iommu_disable(struct tce_container *container) > > container->enabled = false; > > - if (!container->grp || !current->mm) > + if (!container->grp) > return; > > data = iommu_group_get_iommudata(container->grp); > if (!data || !data->iommu_owner || !data->ops->get_table) > return; > > - tbl = data->ops->get_table(data, 0); > - if (!tbl) > - return; > + if (current->mm) { > + tbl = data->ops->get_table(data, 0); > + if (tbl) > + decrement_locked_vm(tbl); > > - decrement_locked_vm(tbl); > + tbl = data->ops->get_table(data, 1); > + if (tbl) > + decrement_locked_vm(tbl); > + } > + > + if (data->ops->reset) > + data->ops->reset(data); > } > > static void *tce_iommu_open(unsigned long arg) > @@ -243,7 +250,7 @@ static long tce_iommu_ioctl(void *iommu_data, > unsigned int cmd, unsigned long arg) > { > struct tce_container *container = iommu_data; > - unsigned long minsz; > + unsigned long minsz, ddwsz; > long ret; > > switch (cmd) { > @@ -288,6 +295,28 @@ static long tce_iommu_ioctl(void *iommu_data, > info.dma32_window_size = tbl->it_size << tbl->it_page_shift; > info.flags = 0; > > + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, > + page_size_mask); > + > + if (info.argsz == ddwsz) { >= > + if (data->ops->query && data->ops->create && > + data->ops->remove) { > + info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; I think you want to set this flag regardless of whether the user has provided space for it. A valid use model is to call with the minimum size and look at the flags to determine if it needs to be called again with a larger size. > + > + ret = data->ops->query(data, > + &info.current_windows, > + &info.windows_available, > + &info.page_size_mask); > + if (ret) > + return ret; > + } else { > + info.current_windows = 0; > + info.windows_available = 0; > + info.page_size_mask = 0; > + } > + minsz = ddwsz; It's not really any longer the min size, is it? > + } > + > if (copy_to_user((void __user *)arg, &info, minsz)) > return -EFAULT; > > @@ -412,12 +441,106 @@ static long tce_iommu_ioctl(void *iommu_data, > tce_iommu_disable(container); > mutex_unlock(&container->lock); > return 0; > + > case VFIO_EEH_PE_OP: > if (!container->grp) > return -ENODEV; > > return vfio_spapr_iommu_eeh_ioctl(container->grp, > cmd, arg); > + > + case VFIO_IOMMU_SPAPR_TCE_CREATE: { > + struct vfio_iommu_spapr_tce_create create; > + struct spapr_tce_iommu_group *data; > + struct iommu_table *tbl; > + > + if (WARN_ON(!container->grp)) redux previous comment on this warning > + return -ENXIO; > + > + data = iommu_group_get_iommudata(container->grp); > + > + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, > + start_addr); > + > + if (copy_from_user(&create, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (create.argsz < minsz) > + return -EINVAL; > + > + if (create.flags) > + return -EINVAL; > + > + if (!data->ops->create || !data->iommu_owner) > + return -ENOSYS; > + > + BUG_ON(!data || !data->ops || !data->ops->remove); Little late for this test since we'll oops on the previous test. Why is this a BUG_ON? A user could exploit this on a system with only a partial set of callbacks. > + > + ret = data->ops->create(data, create.page_shift, > + create.window_shift, &tbl); > + if (ret) > + return ret; > + > + ret = try_increment_locked_vm(tbl); > + if (ret) { > + data->ops->remove(data, tbl); > + return ret; > + } > + > + create.start_addr = tbl->it_offset << tbl->it_page_shift; > + > + if (copy_to_user((void __user *)arg, &create, minsz)) { > + data->ops->remove(data, tbl); > + decrement_locked_vm(tbl); > + return -EFAULT; > + } > + mutex_lock(&container->lock); > + ++container->windows_num; > + mutex_unlock(&container->lock); > + > + return ret; > + } > + case VFIO_IOMMU_SPAPR_TCE_REMOVE: { > + struct vfio_iommu_spapr_tce_remove remove; > + struct spapr_tce_iommu_group *data; > + struct iommu_table *tbl; > + > + if (WARN_ON(!container->grp)) > + return -ENXIO; > + > + data = iommu_group_get_iommudata(container->grp); > + > + minsz = offsetofend(struct vfio_iommu_spapr_tce_remove, > + start_addr); > + > + if (copy_from_user(&remove, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (remove.argsz < minsz) > + return -EINVAL; > + > + if (remove.flags) > + return -EINVAL; > + > + if (!data->ops->remove || !data->iommu_owner) On this one we don't both to get data/data->ops. Is there also an exploit where the user can call these CREATE/REMOVE interfaces even though INFO doesn't expose them if only a partial set of callbacks are present? > + return -ENOSYS; > + > + tbl = spapr_tce_find_table(container, data, remove.start_addr); What happens if this returns the 0 index rather than the expected 1 index table? Why doesn't this call ops->find_table()? > + if (!tbl) > + return -EINVAL; > + > + ret = data->ops->remove(data, tbl); > + if (ret) > + return ret; > + > + decrement_locked_vm(tbl); > + > + mutex_lock(&container->lock); > + --container->windows_num; > + mutex_unlock(&container->lock); > + > + return 0; > + } > } > > return -ENOTTY; > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 6612974..e71a6ef 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -451,9 +451,13 @@ struct vfio_iommu_type1_dma_unmap { > */ > struct vfio_iommu_spapr_tce_info { > __u32 argsz; > - __u32 flags; /* reserved for future use */ > + __u32 flags; > +#define VFIO_IOMMU_SPAPR_TCE_FLAG_DDW 1 /* Support dynamic windows */ > __u32 dma32_window_start; /* 32 bit window start (bytes) */ > __u32 dma32_window_size; /* 32 bit window size (bytes) */ > + __u32 current_windows; > + __u32 windows_available; > + __u32 page_size_mask; > }; > > #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) > @@ -489,6 +493,25 @@ struct vfio_eeh_pe_op { > > #define VFIO_EEH_PE_OP _IO(VFIO_TYPE, VFIO_BASE + 21) > > +struct vfio_iommu_spapr_tce_create { > + __u32 argsz; > + __u32 flags; > + /* in */ > + __u32 page_shift; > + __u32 window_shift; > + /* out */ > + __u64 start_addr; > +}; > +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 18) > + > +struct vfio_iommu_spapr_tce_remove { > + __u32 argsz; > + __u32 flags; > + /* in */ > + __u64 start_addr; > +}; > +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 19) > + Zero comments, no good. > /* ***************************************************************** */ > > #endif /* _UAPIVFIO_H */ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html