Hi Shameer, On 18/01/18 01:04, Alex Williamson wrote: > On Fri, 12 Jan 2018 16:45:28 +0000 > Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote: > >> This retrieves the reserved regions associated with dev group and >> checks for conflicts with any existing dma mappings. Also update >> the iova list excluding the reserved regions. >> >> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> >> --- >> drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++- >> 1 file changed, 159 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c >> index 11cbd49..7609070 100644 >> --- a/drivers/vfio/vfio_iommu_type1.c >> +++ b/drivers/vfio/vfio_iommu_type1.c >> @@ -28,6 +28,7 @@ >> #include <linux/device.h> >> #include <linux/fs.h> >> #include <linux/iommu.h> >> +#include <linux/list_sort.h> >> #include <linux/module.h> >> #include <linux/mm.h> >> #include <linux/rbtree.h> >> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base) >> return ret; >> } >> > > /* list_sort helper */ > >> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b) >> +{ >> + struct iommu_resv_region *ra, *rb; >> + >> + ra = container_of(a, struct iommu_resv_region, list); >> + rb = container_of(b, struct iommu_resv_region, list); >> + >> + if (ra->start < rb->start) >> + return -1; >> + if (ra->start > rb->start) >> + return 1; >> + return 0; >> +} >> + >> static int vfio_insert_iova(phys_addr_t start, phys_addr_t end, >> struct list_head *head) >> { >> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu, >> } >> >> /* >> + * Check reserved region conflicts with existing dma mappings >> + */ >> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu, >> + struct list_head *resv_regions) >> +{ >> + struct iommu_resv_region *region; >> + >> + /* Check for conflict with existing dma mappings */ >> + list_for_each_entry(region, resv_regions, list) { >> + if (vfio_find_dma_overlap(iommu, region->start, >> + region->start + region->length - 1)) >> + return -EINVAL; >> + } >> + >> + return 0; >> +} > > This basically does the same test as vfio_iommu_valid_aperture but > properly names it a conflict test. Please be consistent. Should this > also return bool, "conflict" is a yes/no answer. > >> + >> +/* >> * Adjust the iommu aperture window if new aperture is a valid one >> */ >> static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu, >> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu, >> return 0; >> } >> >> +/* >> + * Check and update iova region list in case a reserved region >> + * overlaps the iommu iova range >> + */ >> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu, >> + struct list_head *resv_regions) > > "resv_region" in previous function, just "resv" here, use consistent > names. Also, what are we adjusting. Maybe "exclude" is a better term. > >> +{ >> + struct iommu_resv_region *resv; >> + struct list_head *iova = &iommu->iova_list; >> + struct vfio_iova *n, *next; >> + >> + list_for_each_entry(resv, resv_regions, list) { >> + phys_addr_t start, end; >> + >> + start = resv->start; >> + end = resv->start + resv->length - 1; >> + >> + list_for_each_entry_safe(n, next, iova, list) { >> + phys_addr_t a, b; >> + int ret = 0; >> + >> + a = n->start; >> + b = n->end; > > 'a' and 'b' variables actually make this incredibly confusing. Use > better variable names or just drop them entirely, it's much easier to > follow as n->start & n->end. > >> + /* No overlap */ >> + if ((start > b) || (end < a)) >> + continue; >> + /* Split the current node and create holes */ >> + if (start > a) >> + ret = vfio_insert_iova(a, start - 1, &n->list); >> + if (!ret && end < b) >> + ret = vfio_insert_iova(end + 1, b, &n->list); >> + if (ret) >> + return ret; >> + >> + list_del(&n->list); > > This is trickier than it appears and deserves some explanation. AIUI, > we're actually inserting duplicate entries for the remainder at the > start of the range and then at the end of the range (and the order is > important here because we're inserting each before the current node), > and then we delete the current node. So the iova_list is kept sorted > through this process, though temporarily includes some bogus, unordered > sub-sets. > >> + kfree(n); >> + } >> + } >> + >> + if (list_empty(iova)) >> + return -EINVAL; >> + >> + return 0; >> +} >> + >> static int vfio_iommu_type1_attach_group(void *iommu_data, >> struct iommu_group *iommu_group) >> { >> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, >> bool resv_msi, msi_remap; >> phys_addr_t resv_msi_base; >> struct iommu_domain_geometry geo; >> + struct list_head group_resv_regions; >> + struct iommu_resv_region *resv, *resv_next; >> >> mutex_lock(&iommu->lock); >> >> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, >> if (ret) >> goto out_detach; >> >> + INIT_LIST_HEAD(&group_resv_regions); >> + iommu_get_group_resv_regions(iommu_group, &group_resv_regions); >> + list_sort(NULL, &group_resv_regions, vfio_resv_cmp); iommu_get_group_resv_regions returns a sorted list (see iommu_insert_resv_regions kerneldoc comment). You can have overlapping regions of different types though. Thanks Eric >> + >> + ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions); >> + if (ret) >> + goto out_detach; >> + >> resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base); >> >> INIT_LIST_HEAD(&domain->group_list); >> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, >> d->prot == domain->prot) { >> iommu_detach_group(domain->domain, iommu_group); >> if (!iommu_attach_group(d->domain, iommu_group)) { >> + ret = vfio_iommu_iova_resv_adjust(iommu, >> + &group_resv_regions); >> + if (!ret) >> + goto out_domain; > > The above function is not without side effects if it fails, it's > altered the iova_list. It needs to be valid for the remaining domains > if we're going to continue. > >> + >> list_add(&group->next, &d->group_list); >> iommu_domain_free(domain->domain); >> kfree(domain); >> - mutex_unlock(&iommu->lock); >> - return 0; >> + goto done; >> } >> >> ret = iommu_attach_group(domain->domain, iommu_group); >> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, >> if (ret) >> goto out_detach; >> >> + ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions); >> + if (ret) >> + goto out_detach; > > Can't we process the reserved regions once before we get here rather > than have two separate call points that do the same thing? In order to > roll back from errors above, it seems like we need to copy iova_list > and work on the copy, installing it and deleting the original only on > success. > >> + >> list_add(&domain->next, &iommu->domain_list); >> >> +done: >> + list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list) >> + kfree(resv); >> mutex_unlock(&iommu->lock); >> >> return 0; >> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, >> iommu_detach_group(domain->domain, iommu_group); >> out_domain: >> iommu_domain_free(domain->domain); >> + list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list) >> + kfree(resv); >> out_free: >> kfree(domain); >> kfree(group); >> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu) >> node->end = end; >> } >> >> +/* >> + * Called when a group is detached. The reserved regions for that >> + * group can be part of valid iova now. But since reserved regions >> + * may be duplicated among groups, populate the iova valid regions >> + list again. >> + */ >> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu) >> +{ >> + struct vfio_domain *d; >> + struct vfio_group *g; >> + struct vfio_iova *node, *tmp; >> + struct iommu_resv_region *resv, *resv_next; >> + struct list_head resv_regions; >> + phys_addr_t start, end; >> + >> + INIT_LIST_HEAD(&resv_regions); >> + >> + list_for_each_entry(d, &iommu->domain_list, next) { >> + list_for_each_entry(g, &d->group_list, next) >> + iommu_get_group_resv_regions(g->iommu_group, >> + &resv_regions); >> + } >> + >> + if (list_empty(&resv_regions)) >> + return; >> + >> + list_sort(NULL, &resv_regions, vfio_resv_cmp); >> + >> + node = list_first_entry(&iommu->iova_list, struct vfio_iova, list); >> + start = node->start; >> + node = list_last_entry(&iommu->iova_list, struct vfio_iova, list); >> + end = node->end; > > list_sort() only sorts based on ->start, we added reserved regions for > all our groups to one list, we potentially have multiple entries with > the same ->start. How can we be sure that the last one in the list > actually has the largest ->end value? > >> + >> + /* purge the iova list and create new one */ >> + list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) { >> + list_del(&node->list); >> + kfree(node); >> + } >> + >> + if (vfio_iommu_iova_aper_adjust(iommu, start, end)) { >> + pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n", >> + __func__); > > Map requests "will" fail. Is this the right error strategy? Detaching > a group cannot fail. Aren't we better off leaving the iova_list we had > in place? If we cannot expand the iova aperture when a group is > removed, a user can continue unscathed. > >> + goto done; >> + } >> + >> + /* adjust the iova with current reserved regions */ >> + if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions)) >> + pr_warn("%s: Failed to update iova list with reserve regions. VFIO DMA map request may fail\n", >> + __func__); > > Same. > >> +done: >> + list_for_each_entry_safe(resv, resv_next, &resv_regions, list) >> + kfree(resv); >> +} >> + >> static void vfio_iommu_type1_detach_group(void *iommu_data, >> struct iommu_group *iommu_group) >> { >> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, >> break; >> } >> >> + vfio_iommu_iova_resv_refresh(iommu); >> + >> detach_group_done: >> mutex_unlock(&iommu->lock); >> } >