Hi Eric,

> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@xxxxxxxxxx]
> Sent: Tuesday, January 23, 2018 8:32 AM
> To: Alex Williamson <alex.williamson@xxxxxxxxxx>; Shameerali Kolothum Thodi <shameerali.kolothum.thodi@xxxxxxxxxx>
> Cc: pmorel@xxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Linuxarm <linuxarm@xxxxxxxxxx>; John Garry <john.garry@xxxxxxxxxx>; xuwei (O) <xuwei5@xxxxxxxxxx>
> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list
>
> Hi Shameer,
>
> On 18/01/18 01:04, Alex Williamson wrote:
> > On Fri, 12 Jan 2018 16:45:28 +0000
> > Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote:
> >
> >> This retrieves the reserved regions associated with dev group and
> >> checks for conflicts with any existing dma mappings. Also update
> >> the iova list excluding the reserved regions.
> >>
> >> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx>
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++-
> >>  1 file changed, 159 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 11cbd49..7609070 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -28,6 +28,7 @@
> >>  #include <linux/device.h>
> >>  #include <linux/fs.h>
> >>  #include <linux/iommu.h>
> >> +#include <linux/list_sort.h>
> >>  #include <linux/module.h>
> >>  #include <linux/mm.h>
> >>  #include <linux/rbtree.h>
> >> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
> >>      return ret;
> >>  }
> >>
> >
> > /* list_sort helper */
> >
> >> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
> >> +{
> >> +    struct iommu_resv_region *ra, *rb;
> >> +
> >> +    ra = container_of(a, struct iommu_resv_region, list);
> >> +    rb = container_of(b, struct iommu_resv_region, list);
> >> +
> >> +    if (ra->start < rb->start)
> >> +        return -1;
> >> +    if (ra->start > rb->start)
> >> +        return 1;
> >> +    return 0;
> >> +}
> >> +
> >>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> >>                              struct list_head *head)
> >>  {
> >> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
> >>  }
> >>
> >>  /*
> >> + * Check reserved region conflicts with existing dma mappings
> >> + */
> >> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
> >> +                    struct list_head *resv_regions)
> >> +{
> >> +    struct iommu_resv_region *region;
> >> +
> >> +    /* Check for conflict with existing dma mappings */
> >> +    list_for_each_entry(region, resv_regions, list) {
> >> +        if (vfio_find_dma_overlap(iommu, region->start,
> >> +                    region->start + region->length - 1))
> >> +            return -EINVAL;
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >
> > This basically does the same test as vfio_iommu_valid_aperture but
> > properly names it a conflict test. Please be consistent. Should this
> > also return bool, "conflict" is a yes/no answer.
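For illustration, a bool variant along the lines suggested above might look
like this (untested sketch; the name is changed per the comment, and
vfio_find_dma_overlap() is the helper already used in the hunk above):

static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
                                     struct list_head *resv_regions)
{
        struct iommu_resv_region *region;

        /* Any overlap between a reserved region and an existing mapping? */
        list_for_each_entry(region, resv_regions, list) {
                if (vfio_find_dma_overlap(iommu, region->start,
                                          region->start + region->length - 1))
                        return true;
        }

        return false;
}

The attach path would then turn a "true" result into -EINVAL itself.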
> >
> >> +
> >> +/*
> >>   * Adjust the iommu aperture window if new aperture is a valid one
> >>   */
> >>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> >> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> >>      return 0;
> >>  }
> >>
> >> +/*
> >> + * Check and update iova region list in case a reserved region
> >> + * overlaps the iommu iova range
> >> + */
> >> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
> >> +                    struct list_head *resv_regions)
> >
> > "resv_region" in previous function, just "resv" here, use consistent
> > names. Also, what are we adjusting. Maybe "exclude" is a better term.
> >
> >> +{
> >> +    struct iommu_resv_region *resv;
> >> +    struct list_head *iova = &iommu->iova_list;
> >> +    struct vfio_iova *n, *next;
> >> +
> >> +    list_for_each_entry(resv, resv_regions, list) {
> >> +        phys_addr_t start, end;
> >> +
> >> +        start = resv->start;
> >> +        end = resv->start + resv->length - 1;
> >> +
> >> +        list_for_each_entry_safe(n, next, iova, list) {
> >> +            phys_addr_t a, b;
> >> +            int ret = 0;
> >> +
> >> +            a = n->start;
> >> +            b = n->end;
> >
> > 'a' and 'b' variables actually make this incredibly confusing. Use
> > better variable names or just drop them entirely, it's much easier to
> > follow as n->start & n->end.
> >
> >> +            /* No overlap */
> >> +            if ((start > b) || (end < a))
> >> +                continue;
> >> +            /* Split the current node and create holes */
> >> +            if (start > a)
> >> +                ret = vfio_insert_iova(a, start - 1, &n->list);
> >> +            if (!ret && end < b)
> >> +                ret = vfio_insert_iova(end + 1, b, &n->list);
> >> +            if (ret)
> >> +                return ret;
> >> +
> >> +            list_del(&n->list);
> >
> > This is trickier than it appears and deserves some explanation. AIUI,
> > we're actually inserting duplicate entries for the remainder at the
> > start of the range and then at the end of the range (and the order is
> > important here because we're inserting each before the current node),
> > and then we delete the current node. So the iova_list is kept sorted
> > through this process, though temporarily includes some bogus, unordered
> > sub-sets.
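For reference, the same inner loop with the a/b temporaries dropped and a
comment spelling out that ordering could read like this (sketch only, same
logic as the hunk above):

        list_for_each_entry_safe(n, next, iova, list) {
                int ret = 0;

                /* No overlap between the reserved range and this node */
                if (start > n->end || end < n->start)
                        continue;

                /*
                 * Punch the hole: insert the remainder below the reserved
                 * range, then the remainder above it, each *before* n so
                 * the list stays sorted by ->start, then drop the original
                 * node. E.g. excluding [0x8000, 0x8fff] from [0x0, 0xffff]
                 * leaves [0x0, 0x7fff] and [0x9000, 0xffff].
                 */
                if (start > n->start)
                        ret = vfio_insert_iova(n->start, start - 1, &n->list);
                if (!ret && end < n->end)
                        ret = vfio_insert_iova(end + 1, n->end, &n->list);
                if (ret)
                        return ret;

                list_del(&n->list);
                kfree(n);
        }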
> >
> >> +            kfree(n);
> >> +        }
> >> +    }
> >> +
> >> +    if (list_empty(iova))
> >> +        return -EINVAL;
> >> +
> >> +    return 0;
> >> +}
> >> +
> >>  static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>                           struct iommu_group *iommu_group)
> >>  {
> >> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>      bool resv_msi, msi_remap;
> >>      phys_addr_t resv_msi_base;
> >>      struct iommu_domain_geometry geo;
> >> +    struct list_head group_resv_regions;
> >> +    struct iommu_resv_region *resv, *resv_next;
> >>
> >>      mutex_lock(&iommu->lock);
> >>
> >> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>      if (ret)
> >>          goto out_detach;
> >>
> >> +    INIT_LIST_HEAD(&group_resv_regions);
> >> +    iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
> >> +    list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
>
> iommu_get_group_resv_regions returns a sorted list (see
> iommu_insert_resv_regions kerneldoc comment). You can have overlapping
> regions of different types though.

Hmm.. I am not sure. It looks like it is only sorted if the regions are of the
same type:

 "* The new element is sorted by address with respect to the other
  * regions of the same type."

So, hypothetically, if there are two groups with regions like:

Group 1:
Start    Size     Type
0x0000   0x1000   1
0x2000   0x1000   1
0x5000   0x1000   1

Group 2:
Start    Size     Type
0x2000   0x4000   2
0x7000   0x1000   1

then iommu_get_group_resv_regions() will return:

Start    Size     Type
0x0000   0x1000   1
0x2000   0x1000   1
0x5000   0x1000   1
0x2000   0x4000   2
0x7000   0x1000   1

But honestly I am not sure whether the above is a valid scenario or not. I am
happy to remove the sorting if such a case can never happen. Please let me
know.

Thanks,
Shameer

> Eric
> >> +
> >> +    ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
> >> +    if (ret)
> >> +        goto out_detach;
> >> +
> >>      resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
> >>
> >>      INIT_LIST_HEAD(&domain->group_list);
> >> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>              d->prot == domain->prot) {
> >>              iommu_detach_group(domain->domain, iommu_group);
> >>              if (!iommu_attach_group(d->domain, iommu_group)) {
> >> +                ret = vfio_iommu_iova_resv_adjust(iommu,
> >> +                            &group_resv_regions);
> >> +                if (!ret)
> >> +                    goto out_domain;
> >
> > The above function is not without side effects if it fails, it's
> > altered the iova_list. It needs to be valid for the remaining domains
> > if we're going to continue.
> >
> >> +
> >>                  list_add(&group->next, &d->group_list);
> >>                  iommu_domain_free(domain->domain);
> >>                  kfree(domain);
> >> -                mutex_unlock(&iommu->lock);
> >> -                return 0;
> >> +                goto done;
> >>              }
> >>
> >>              ret = iommu_attach_group(domain->domain, iommu_group);
> >> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>      if (ret)
> >>          goto out_detach;
> >>
> >> +    ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
> >> +    if (ret)
> >> +        goto out_detach;
> >
> > Can't we process the reserved regions once before we get here rather
> > than have two separate call points that do the same thing? In order to
> > roll back from errors above, it seems like we need to copy iova_list
> > and work on the copy, installing it and deleting the original only on
> > success.
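One possible shape for that "work on a copy" approach (untested sketch; the
helper names below are made up, struct vfio_iova and vfio_insert_iova() are
the ones from this patch, and this assumes vfio_insert_iova() links the new
node in before the head it is given, as in its use above):

/* Duplicate the current iova_list onto 'copy', preserving the order */
static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
                                    struct list_head *copy)
{
        struct vfio_iova *n;
        int ret;

        list_for_each_entry(n, &iommu->iova_list, list) {
                ret = vfio_insert_iova(n->start, n->end, copy);
                if (ret)
                        return ret;
        }
        return 0;
}

/* Free every node on a (possibly partially built) iova list */
static void vfio_iommu_iova_free(struct list_head *iova)
{
        struct vfio_iova *n, *next;

        list_for_each_entry_safe(n, next, iova, list) {
                list_del(&n->list);
                kfree(n);
        }
}

/* Throw away the current iova_list and install the adjusted copy */
static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
                                        struct list_head *copy)
{
        vfio_iommu_iova_free(&iommu->iova_list);
        list_splice_tail(copy, &iommu->iova_list);
}

The attach path could then run the conflict check and the exclude step (with
vfio_iommu_iova_resv_adjust() reworked to take the list to operate on)
against the copy, and only call vfio_iommu_iova_insert_copy() once the group
has actually been attached, so a failure anywhere leaves iommu->iova_list
untouched.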
> >
> >> +
> >>      list_add(&domain->next, &iommu->domain_list);
> >>
> >> +done:
> >> +    list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> >> +        kfree(resv);
> >>      mutex_unlock(&iommu->lock);
> >>
> >>      return 0;
> >> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>      iommu_detach_group(domain->domain, iommu_group);
> >>  out_domain:
> >>      iommu_domain_free(domain->domain);
> >> +    list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> >> +        kfree(resv);
> >>  out_free:
> >>      kfree(domain);
> >>      kfree(group);
> >> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
> >>      node->end = end;
> >>  }
> >>
> >> +/*
> >> + * Called when a group is detached. The reserved regions for that
> >> + * group can be part of valid iova now. But since reserved regions
> >> + * may be duplicated among groups, populate the iova valid regions
> >> + * list again.
> >> + */
> >> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
> >> +{
> >> +    struct vfio_domain *d;
> >> +    struct vfio_group *g;
> >> +    struct vfio_iova *node, *tmp;
> >> +    struct iommu_resv_region *resv, *resv_next;
> >> +    struct list_head resv_regions;
> >> +    phys_addr_t start, end;
> >> +
> >> +    INIT_LIST_HEAD(&resv_regions);
> >> +
> >> +    list_for_each_entry(d, &iommu->domain_list, next) {
> >> +        list_for_each_entry(g, &d->group_list, next)
> >> +            iommu_get_group_resv_regions(g->iommu_group,
> >> +                             &resv_regions);
> >> +    }
> >> +
> >> +    if (list_empty(&resv_regions))
> >> +        return;
> >> +
> >> +    list_sort(NULL, &resv_regions, vfio_resv_cmp);
> >> +
> >> +    node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> >> +    start = node->start;
> >> +    node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> >> +    end = node->end;
> >
> > list_sort() only sorts based on ->start, we added reserved regions for
> > all our groups to one list, we potentially have multiple entries with
> > the same ->start. How can we be sure that the last one in the list
> > actually has the largest ->end value?
> >
> >> +
> >> +    /* purge the iova list and create new one */
> >> +    list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
> >> +        list_del(&node->list);
> >> +        kfree(node);
> >> +    }
> >> +
> >> +    if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
> >> +        pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n",
> >> +            __func__);
> >
> > Map requests "will" fail. Is this the right error strategy? Detaching
> > a group cannot fail. Aren't we better off leaving the iova_list we had
> > in place? If we cannot expand the iova aperture when a group is
> > removed, a user can continue unscathed.
> >
> >> +        goto done;
> >> +    }
> >> +
> >> +    /* adjust the iova with current reserved regions */
> >> +    if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
> >> +        pr_warn("%s: Failed to update iova list with reserve regions. VFIO DMA map request may fail\n",
> >> +            __func__);
> >
> > Same.
> >
> >> +done:
> >> +    list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
> >> +        kfree(resv);
> >> +}
> >> +
> >>  static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>                            struct iommu_group *iommu_group)
> >>  {
> >> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>          break;
> >>      }
> >>
> >> +    vfio_iommu_iova_resv_refresh(iommu);
> >> +
> >>  detach_group_done:
> >>      mutex_unlock(&iommu->lock);
> >>  }
> >
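On the detach path comments above ("Aren't we better off leaving the
iova_list we had in place?"): with copy helpers like the ones sketched
earlier, the refresh could rebuild everything on a scratch list and simply
keep the current iova_list whenever anything fails, roughly like this
(untested fragment of vfio_iommu_iova_resv_refresh(); the hypothetical
vfio_iommu_iova_resv_exclude() stands in for an adjust variant that takes
the list to operate on):

        LIST_HEAD(iova_copy);

        /* Rebuild the full aperture [start, end] on a scratch list */
        if (vfio_insert_iova(start, end, &iova_copy))
                goto out_keep_current;

        /* Re-apply the reserved regions of the remaining groups */
        if (vfio_iommu_iova_resv_exclude(&iova_copy, &resv_regions))
                goto out_keep_current;

        /* Success: swap the scratch list in place of the old one */
        vfio_iommu_iova_insert_copy(iommu, &iova_copy);
        goto done;

out_keep_current:
        /* Leave the existing (still valid, if smaller) iova_list alone */
        vfio_iommu_iova_free(&iova_copy);
done:
        list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
                kfree(resv);

That way a user who cannot get the larger aperture back still continues with
the ranges that were already valid.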