Hi Alex, > -----Original Message----- > From: Alex Williamson [mailto:alex.williamson@xxxxxxxxxx] > Sent: Saturday, October 03, 2015 4:16 AM > To: Bhushan Bharat-R65777 <Bharat.Bhushan@xxxxxxxxxxxxx> > Cc: kvmarm@xxxxxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; > christoffer.dall@xxxxxxxxxx; eric.auger@xxxxxxxxxx; pranavkumar@xxxxxxxxxx; > marc.zyngier@xxxxxxx; will.deacon@xxxxxxx > Subject: Re: [RFC PATCH 1/6] vfio: Add interface for add/del reserved iova > region > > On Wed, 2015-09-30 at 20:26 +0530, Bharat Bhushan wrote: > > This Patch adds the VFIO APIs to add and remove reserved iova regions. > > The reserved iova region can be used for mapping some specific > > physical address in iommu. > > > > Currently we are planning to use this interface for adding iova > > regions for creating iommu of msi-pages. But the API are designed for > > future extension where some other physical address can be mapped. > > > > Signed-off-by: Bharat Bhushan <Bharat.Bhushan@xxxxxxxxxxxxx> > > --- > > drivers/vfio/vfio_iommu_type1.c | 201 > +++++++++++++++++++++++++++++++++++++++- > > include/uapi/linux/vfio.h | 43 +++++++++ > > 2 files changed, 243 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c index 57d8c37..fa5d3e4 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -59,6 +59,7 @@ struct vfio_iommu { > > struct rb_root dma_list; > > bool v2; > > bool nesting; > > + struct list_head reserved_iova_list; > > This alignment leads to poor packing in the structure, put it above the bools. ok > > > }; > > > > struct vfio_domain { > > @@ -77,6 +78,15 @@ struct vfio_dma { > > int prot; /* IOMMU_READ/WRITE */ > > }; > > > > +struct vfio_resvd_region { > > + dma_addr_t iova; > > + size_t size; > > + int prot; /* IOMMU_READ/WRITE */ > > + int refcount; /* ref count of mappings */ > > + uint64_t map_paddr; /* Mapped Physical Address > */ > > phys_addr_t Ok, > > > + struct list_head next; > > +}; > > + > > struct vfio_group { > > struct iommu_group *iommu_group; > > struct list_head next; > > @@ -106,6 +116,38 @@ static struct vfio_dma *vfio_find_dma(struct > vfio_iommu *iommu, > > return NULL; > > } > > > > +/* This function must be called with iommu->lock held */ static bool > > +vfio_overlap_with_resvd_region(struct vfio_iommu *iommu, > > + dma_addr_t start, size_t size) { > > + struct vfio_resvd_region *region; > > + > > + list_for_each_entry(region, &iommu->reserved_iova_list, next) { > > + if (region->iova < start) > > + return (start - region->iova < region->size); > > + else if (start < region->iova) > > + return (region->iova - start < size); > > <= on both of the return lines? I think is should be "<" and not "=<", no ? > > > + > > + return (region->size > 0 && size > 0); > > + } > > + > > + return false; > > +} > > + > > +/* This function must be called with iommu->lock held */ static > > +struct vfio_resvd_region *vfio_find_resvd_region(struct vfio_iommu > *iommu, > > + dma_addr_t start, size_t > size) { > > + struct vfio_resvd_region *region; > > + > > + list_for_each_entry(region, &iommu->reserved_iova_list, next) > > + if (region->iova == start && region->size == size) > > + return region; > > + > > + return NULL; > > +} > > + > > static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma > > *new) { > > struct rb_node **link = &iommu->dma_list.rb_node, *parent = > NULL; @@ > > -580,7 +622,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, > > > > mutex_lock(&iommu->lock); > > > > - if (vfio_find_dma(iommu, iova, size)) { > > + if (vfio_find_dma(iommu, iova, size) || > > + vfio_overlap_with_resvd_region(iommu, iova, size)) { > > mutex_unlock(&iommu->lock); > > return -EEXIST; > > } > > @@ -626,6 +669,127 @@ static int vfio_dma_do_map(struct vfio_iommu > *iommu, > > return ret; > > } > > > > +/* This function must be called with iommu->lock held */ static int > > +vfio_iommu_resvd_region_del(struct vfio_iommu *iommu, > > + dma_addr_t iova, size_t size, int prot) { > > + struct vfio_resvd_region *res_region; > > Have some consistency in naming, just use "region". Ok, > > + > > + res_region = vfio_find_resvd_region(iommu, iova, size); > > + /* Region should not be mapped in iommu */ > > + if (res_region == NULL || res_region->map_paddr) > > + return -EINVAL; > > Are these two separate errors? !region is -EINVAL, but being mapped is - > EBUSY. Yes, will separate them. > > > + > > + list_del(&res_region->next); > > + kfree(res_region); > > + return 0; > > +} > > + > > +/* This function must be called with iommu->lock held */ static int > > +vfio_iommu_resvd_region_add(struct vfio_iommu *iommu, > > + dma_addr_t iova, size_t size, int prot) { > > + struct vfio_resvd_region *res_region; > > + > > + /* Check overlap with with dma maping and reserved regions */ > > + if (vfio_find_dma(iommu, iova, size) || > > + vfio_find_resvd_region(iommu, iova, size)) > > + return -EEXIST; > > + > > + res_region = kzalloc(sizeof(*res_region), GFP_KERNEL); > > + if (res_region == NULL) > > + return -ENOMEM; > > + > > + res_region->iova = iova; > > + res_region->size = size; > > + res_region->prot = prot; > > + res_region->refcount = 0; > > + res_region->map_paddr = 0; > > They're already 0 by the kzalloc Yes ;) > > > + > > + list_add(&res_region->next, &iommu->reserved_iova_list); > > + > > + return 0; > > +} > > + > > +static > > +int vfio_handle_reserved_region_add(struct vfio_iommu *iommu, > > + struct vfio_iommu_reserved_region_add > *region) { > > + dma_addr_t iova = region->iova; > > + size_t size = region->size; > > + int flags = region->flags; > > + uint64_t mask; > > + int prot = 0; > > + int ret; > > + > > + /* Verify that none of our __u64 fields overflow */ > > + if (region->size != size || region->iova != iova) > > + return -EINVAL; > > + > > + mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; > > + > > + WARN_ON(mask & PAGE_MASK); > > + > > + if (flags & VFIO_IOMMU_RES_REGION_READ) > > + prot |= IOMMU_READ; > > + if (flags & VFIO_IOMMU_RES_REGION_WRITE) > > + prot |= IOMMU_WRITE; > > + > > + if (!prot || !size || (size | iova) & mask) > > + return -EINVAL; > > + > > + /* Don't allow IOVA wrap */ > > + if (iova + size - 1 < iova) > > + return -EINVAL; > > + > > + mutex_lock(&iommu->lock); > > + > > + if (region->flags & VFIO_IOMMU_RES_REGION_ADD) { > > + ret = vfio_iommu_resvd_region_add(iommu, iova, size, > prot); > > + if (ret) { > > + mutex_unlock(&iommu->lock); > > + return ret; > > + } > > + } > > Silently fail if not VFIO_IOMMU_RES_REGION_ADD? As per below comment we do not need this flag. So the above flag checking will be removed. > > > + > > + mutex_unlock(&iommu->lock); > > + return 0; > > +} > > + > > +static > > +int vfio_handle_reserved_region_del(struct vfio_iommu *iommu, > > + struct vfio_iommu_reserved_region_del > *region) { > > + dma_addr_t iova = region->iova; > > + size_t size = region->size; > > + int flags = region->flags; > > + int ret; > > + > > + if (!(flags & VFIO_IOMMU_RES_REGION_DEL)) > > + return -EINVAL; > > + > > + mutex_lock(&iommu->lock); > > + > > + /* Check for the region */ > > + if (vfio_find_resvd_region(iommu, iova, size) == NULL) { > > + mutex_unlock(&iommu->lock); > > + return -EINVAL; > > + } > > + > > + /* remove the reserved region */ > > + if (region->flags & VFIO_IOMMU_RES_REGION_DEL) { > > + ret = vfio_iommu_resvd_region_del(iommu, iova, size, > flags); > > + if (ret) { > > + mutex_unlock(&iommu->lock); > > + return ret; > > + } > > + } > > + > > + mutex_unlock(&iommu->lock); > > + return 0; > > +} > > + > > static int vfio_bus_type(struct device *dev, void *data) { > > struct bus_type **bus = data; > > @@ -905,6 +1069,7 @@ static void *vfio_iommu_type1_open(unsigned > long arg) > > } > > > > INIT_LIST_HEAD(&iommu->domain_list); > > + INIT_LIST_HEAD(&iommu->reserved_iova_list); > > iommu->dma_list = RB_ROOT; > > mutex_init(&iommu->lock); > > > > @@ -1020,6 +1185,40 @@ static long vfio_iommu_type1_ioctl(void > *iommu_data, > > return ret; > > > > return copy_to_user((void __user *)arg, &unmap, minsz); > > + } else if (cmd == VFIO_IOMMU_RESERVED_REGION_ADD) { > > + struct vfio_iommu_reserved_region_add region; > > + long ret; > > + > > + minsz = offsetofend(struct > vfio_iommu_reserved_region_add, > > + size); > > + if (copy_from_user(®ion, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (region.argsz < minsz) > > + return -EINVAL; > > + > > + ret = vfio_handle_reserved_region_add(iommu, ®ion); > > + if (ret) > > + return ret; > > + > > + return copy_to_user((void __user *)arg, ®ion, minsz); > > + } else if (cmd == VFIO_IOMMU_RESERVED_REGION_DEL) { > > + struct vfio_iommu_reserved_region_del region; > > + long ret; > > + > > + minsz = offsetofend(struct > vfio_iommu_reserved_region_del, > > + size); > > + if (copy_from_user(®ion, (void __user *)arg, minsz)) > > + return -EFAULT; > > + > > + if (region.argsz < minsz) > > + return -EINVAL; > > + > > + ret = vfio_handle_reserved_region_del(iommu, ®ion); > > + if (ret) > > + return ret; > > + > > + return copy_to_user((void __user *)arg, ®ion, minsz); > > So we've just created an interface that is available for all vfio-type1 users, > whether it makes any sense for the platform or not, How we should decide that a given platform needs this or not? > and it allows the user to > consume arbitrary amounts of kernel memory, by making an infinitely long > list of reserved iova entries, brilliant! I was not sure of how to limit the user. What I was thinking of having a default number of pages a user can reserve (512 pages). Also we can give a sysfs interface so that user can change the default number of pages. Does this sound good? If not please suggest? > > > } > > > > return -ENOTTY; > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index b57b750..1abd1a9 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -440,6 +440,49 @@ struct vfio_iommu_type1_dma_unmap { > > #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) > > #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) > > > > +/**************** Reserved IOVA region specific APIs > > +**********************/ > > + > > +/* > > + * VFIO_IOMMU_RESERVED_REGION_ADD - _IO(VFIO_TYPE, VFIO_BASE > + 17, > > + * struct > vfio_iommu_reserved_region_add) > > + * This is used to add a reserved iova region. > > + * @flags - Input: VFIO_IOMMU_RES_REGION_ADD flag is for adding > > + * a reserved region. > > Why else would we call VFIO_IOMMU_RESERVED_REGION_ADD except to > add a region, this flag is redundant. Ok, will remove this. > > > + * Also pass READ/WRITE/IOMMU flags to be used in iommu mapping. > > + * @iova - Input: IOVA base address of reserved region > > + * @size - Input: Size of the reserved region > > + * Return: 0 on success, -errno on failure */ struct > > +vfio_iommu_reserved_region_add { > > + __u32 argsz; > > + __u32 flags; > > +#define VFIO_IOMMU_RES_REGION_ADD (1 << 0) /* Add a reserved > region */ > > +#define VFIO_IOMMU_RES_REGION_READ (1 << 1) /* readable region */ > > +#define VFIO_IOMMU_RES_REGION_WRITE (1 << 2) /* writable > region */ > > + __u64 iova; > > + __u64 size; > > +}; > > +#define VFIO_IOMMU_RESERVED_REGION_ADD _IO(VFIO_TYPE, > VFIO_BASE + 17) > > + > > +/* > > + * VFIO_IOMMU_RESERVED_REGION_DEL - _IO(VFIO_TYPE, VFIO_BASE + > 18, > > + * struct > vfio_iommu_reserved_region_del) > > + * This is used to delete an existing reserved iova region. > > + * @flags - VFIO_IOMMU_RES_REGION_DEL flag is for deleting a region > > +use, > > + * only a unmapped region can be deleted. > > + * @iova - Input: IOVA base address of reserved region > > + * @size - Input: Size of the reserved region > > + * Return: 0 on success, -errno on failure */ struct > > +vfio_iommu_reserved_region_del { > > + __u32 argsz; > > + __u32 flags; > > +#define VFIO_IOMMU_RES_REGION_DEL (1 << 0) /* unset the > reserved region */ > > + __u64 iova; > > + __u64 size; > > +}; > > +#define VFIO_IOMMU_RESERVED_REGION_DEL _IO(VFIO_TYPE, > VFIO_BASE + 18) > > + > > These are effectively both > > struct vfio_iommu_type1_dma_unmap Yes, do you want to suggest that we should use " struct vfio_iommu_type1_dma_unmap". I found that confusing. What is we use "struct vfio_iommu_reserved_region" and use flag VFIO_IOMMU_RES_REGION_DEL/ VFIO_IOMMU_RES_REGION_ADD. Thanks -Bharat > > > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU > > -------- */ > > > > /* > > ��.n��������+%������w��{.n�����o�^n�r������&��z�ޗ�zf���h���~����������_��+v���)ߣ�