Hi Shameer, On 23/01/18 13:16, Shameerali Kolothum Thodi wrote: > Hi Eric, > >> -----Original Message----- >> From: Auger Eric [mailto:eric.auger@xxxxxxxxxx] >> Sent: Tuesday, January 23, 2018 8:32 AM >> To: Alex Williamson <alex.williamson@xxxxxxxxxx>; Shameerali Kolothum >> Thodi <shameerali.kolothum.thodi@xxxxxxxxxx> >> Cc: pmorel@xxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx; linux- >> kernel@xxxxxxxxxxxxxxx; Linuxarm <linuxarm@xxxxxxxxxx>; John Garry >> <john.garry@xxxxxxxxxx>; xuwei (O) <xuwei5@xxxxxxxxxx> >> Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update >> iova list >> >> Hi Shameer, >> >> On 18/01/18 01:04, Alex Williamson wrote: >>> On Fri, 12 Jan 2018 16:45:28 +0000 >>> Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote: >>> >>>> This retrieves the reserved regions associated with dev group and >>>> checks for conflicts with any existing dma mappings. Also update >>>> the iova list excluding the reserved regions. >>>> >>>> Signed-off-by: Shameer Kolothum >> <shameerali.kolothum.thodi@xxxxxxxxxx> >>>> --- >>>> drivers/vfio/vfio_iommu_type1.c | 161 >> +++++++++++++++++++++++++++++++++++++++- >>>> 1 file changed, 159 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c >> b/drivers/vfio/vfio_iommu_type1.c >>>> index 11cbd49..7609070 100644 >>>> --- a/drivers/vfio/vfio_iommu_type1.c >>>> +++ b/drivers/vfio/vfio_iommu_type1.c >>>> @@ -28,6 +28,7 @@ >>>> #include <linux/device.h> >>>> #include <linux/fs.h> >>>> #include <linux/iommu.h> >>>> +#include <linux/list_sort.h> >>>> #include <linux/module.h> >>>> #include <linux/mm.h> >>>> #include <linux/rbtree.h> >>>> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct >> iommu_group *group, phys_addr_t *base) >>>> return ret; >>>> } >>>> >>> >>> /* list_sort helper */ >>> >>>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b) >>>> +{ >>>> + struct iommu_resv_region *ra, *rb; >>>> + >>>> + ra = container_of(a, struct iommu_resv_region, list); >>>> + rb = container_of(b, struct iommu_resv_region, list); >>>> + >>>> + if (ra->start < rb->start) >>>> + return -1; >>>> + if (ra->start > rb->start) >>>> + return 1; >>>> + return 0; >>>> +} >>>> + >>>> static int vfio_insert_iova(phys_addr_t start, phys_addr_t end, >>>> struct list_head *head) >>>> { >>>> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct >> vfio_iommu *iommu, >>>> } >>>> >>>> /* >>>> + * Check reserved region conflicts with existing dma mappings >>>> + */ >>>> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu, >>>> + struct list_head *resv_regions) >>>> +{ >>>> + struct iommu_resv_region *region; >>>> + >>>> + /* Check for conflict with existing dma mappings */ >>>> + list_for_each_entry(region, resv_regions, list) { >>>> + if (vfio_find_dma_overlap(iommu, region->start, >>>> + region->start + region->length - 1)) >>>> + return -EINVAL; >>>> + } >>>> + >>>> + return 0; >>>> +} >>> >>> This basically does the same test as vfio_iommu_valid_aperture but >>> properly names it a conflict test. Please be consistent. Should this >>> also return bool, "conflict" is a yes/no answer. >>> >>>> + >>>> +/* >>>> * Adjust the iommu aperture window if new aperture is a valid one >>>> */ >>>> static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu, >>>> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct >> vfio_iommu *iommu, >>>> return 0; >>>> } >>>> >>>> +/* >>>> + * Check and update iova region list in case a reserved region >>>> + * overlaps the iommu iova range >>>> + */ >>>> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu, >>>> + struct list_head *resv_regions) >>> >>> "resv_region" in previous function, just "resv" here, use consistent >>> names. Also, what are we adjusting. Maybe "exclude" is a better term. >>> >>>> +{ >>>> + struct iommu_resv_region *resv; >>>> + struct list_head *iova = &iommu->iova_list; >>>> + struct vfio_iova *n, *next; >>>> + >>>> + list_for_each_entry(resv, resv_regions, list) { >>>> + phys_addr_t start, end; >>>> + >>>> + start = resv->start; >>>> + end = resv->start + resv->length - 1; >>>> + >>>> + list_for_each_entry_safe(n, next, iova, list) { >>>> + phys_addr_t a, b; >>>> + int ret = 0; >>>> + >>>> + a = n->start; >>>> + b = n->end; >>> >>> 'a' and 'b' variables actually make this incredibly confusing. Use >>> better variable names or just drop them entirely, it's much easier to >>> follow as n->start & n->end. >>> >>>> + /* No overlap */ >>>> + if ((start > b) || (end < a)) >>>> + continue; >>>> + /* Split the current node and create holes */ >>>> + if (start > a) >>>> + ret = vfio_insert_iova(a, start - 1, &n->list); >>>> + if (!ret && end < b) >>>> + ret = vfio_insert_iova(end + 1, b, &n->list); >>>> + if (ret) >>>> + return ret; >>>> + >>>> + list_del(&n->list); >>> >>> This is trickier than it appears and deserves some explanation. AIUI, >>> we're actually inserting duplicate entries for the remainder at the >>> start of the range and then at the end of the range (and the order is >>> important here because we're inserting each before the current node), >>> and then we delete the current node. So the iova_list is kept sorted >>> through this process, though temporarily includes some bogus, unordered >>> sub-sets. >>> >>>> + kfree(n); >>>> + } >>>> + } >>>> + >>>> + if (list_empty(iova)) >>>> + return -EINVAL; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> static int vfio_iommu_type1_attach_group(void *iommu_data, >>>> struct iommu_group *iommu_group) >>>> { >>>> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void >> *iommu_data, >>>> bool resv_msi, msi_remap; >>>> phys_addr_t resv_msi_base; >>>> struct iommu_domain_geometry geo; >>>> + struct list_head group_resv_regions; >>>> + struct iommu_resv_region *resv, *resv_next; >>>> >>>> mutex_lock(&iommu->lock); >>>> >>>> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void >> *iommu_data, >>>> if (ret) >>>> goto out_detach; >>>> >>>> + INIT_LIST_HEAD(&group_resv_regions); >>>> + iommu_get_group_resv_regions(iommu_group, &group_resv_regions); >>>> + list_sort(NULL, &group_resv_regions, vfio_resv_cmp); >> iommu_get_group_resv_regions returns a sorted list (see >> iommu_insert_resv_regions kerneldoc comment). You can have overlapping >> regions of different types though. > > Hmm..I am not sure. It looks like it is sorted only if the regions are of same type. > > "* The new element is sorted by address with respect to the other > * regions of the same type." > > So hypothetically if there are two groups with regions like, > > Group 1. > Start size type > 0x0000 0x1000 1 > 0x2000 0x1000 1 > 0x5000 0x1000 1 > > Group 2 > Start size type > 0x2000 0x4000 2 > 0x7000 0x1000 1 > > Then the iommu_get_group_resv_regions() will return, > > 0x0000 0x1000 1 > 0x2000 0x1000 1 > 0x5000 0x1000 1 > 0x2000 0x4000 2 > 0x7000 0x1000 1 Hum yes, I remember now, sorry. It was made on purpose to avoid to display interleaved resv region types in /sys/kernel/iommu_groups/reserved_regions. I think it gives a better user experience. Thanks Eric > > But honestly I am not sure the above is a valid scenario or not. I am > happy to remove the sorting if such a case will never happen. > > Please let me know. > > Thanks, > Shameer > >> Eric >>>> + >>>> + ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions); >>>> + if (ret) >>>> + goto out_detach; >>>> + >>>> resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base); >>>> >>>> INIT_LIST_HEAD(&domain->group_list); >>>> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void >> *iommu_data, >>>> d->prot == domain->prot) { >>>> iommu_detach_group(domain->domain, >> iommu_group); >>>> if (!iommu_attach_group(d->domain, iommu_group)) { >>>> + ret = vfio_iommu_iova_resv_adjust(iommu, >>>> + >> &group_resv_regions); >>>> + if (!ret) >>>> + goto out_domain; >>> >>> The above function is not without side effects if it fails, it's >>> altered the iova_list. It needs to be valid for the remaining domains >>> if we're going to continue. >>> >>>> + >>>> list_add(&group->next, &d->group_list); >>>> iommu_domain_free(domain->domain); >>>> kfree(domain); >>>> - mutex_unlock(&iommu->lock); >>>> - return 0; >>>> + goto done; >>>> } >>>> >>>> ret = iommu_attach_group(domain->domain, >> iommu_group); >>>> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void >> *iommu_data, >>>> if (ret) >>>> goto out_detach; >>>> >>>> + ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions); >>>> + if (ret) >>>> + goto out_detach; >>> >>> Can't we process the reserved regions once before we get here rather >>> than have two separate call points that do the same thing? In order to >>> roll back from errors above, it seems like we need to copy iova_list >>> and work on the copy, installing it and deleting the original only on >>> success. >>> >>>> + >>>> list_add(&domain->next, &iommu->domain_list); >>>> >>>> +done: >>>> + list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list) >>>> + kfree(resv); >>>> mutex_unlock(&iommu->lock); >>>> >>>> return 0; >>>> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void >> *iommu_data, >>>> iommu_detach_group(domain->domain, iommu_group); >>>> out_domain: >>>> iommu_domain_free(domain->domain); >>>> + list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list) >>>> + kfree(resv); >>>> out_free: >>>> kfree(domain); >>>> kfree(group); >>>> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct >> vfio_iommu *iommu) >>>> node->end = end; >>>> } >>>> >>>> +/* >>>> + * Called when a group is detached. The reserved regions for that >>>> + * group can be part of valid iova now. But since reserved regions >>>> + * may be duplicated among groups, populate the iova valid regions >>>> + list again. >>>> + */ >>>> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu) >>>> +{ >>>> + struct vfio_domain *d; >>>> + struct vfio_group *g; >>>> + struct vfio_iova *node, *tmp; >>>> + struct iommu_resv_region *resv, *resv_next; >>>> + struct list_head resv_regions; >>>> + phys_addr_t start, end; >>>> + >>>> + INIT_LIST_HEAD(&resv_regions); >>>> + >>>> + list_for_each_entry(d, &iommu->domain_list, next) { >>>> + list_for_each_entry(g, &d->group_list, next) >>>> + iommu_get_group_resv_regions(g->iommu_group, >>>> + &resv_regions); >>>> + } >>>> + >>>> + if (list_empty(&resv_regions)) >>>> + return; >>>> + >>>> + list_sort(NULL, &resv_regions, vfio_resv_cmp); >>>> + >>>> + node = list_first_entry(&iommu->iova_list, struct vfio_iova, list); >>>> + start = node->start; >>>> + node = list_last_entry(&iommu->iova_list, struct vfio_iova, list); >>>> + end = node->end; >>> >>> list_sort() only sorts based on ->start, we added reserved regions for >>> all our groups to one list, we potentially have multiple entries with >>> the same ->start. How can we be sure that the last one in the list >>> actually has the largest ->end value? >>> >>>> + >>>> + /* purge the iova list and create new one */ >>>> + list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) { >>>> + list_del(&node->list); >>>> + kfree(node); >>>> + } >>>> + >>>> + if (vfio_iommu_iova_aper_adjust(iommu, start, end)) { >>>> + pr_warn("%s: Failed to update iova aperture. VFIO DMA map >> request may fail\n", >>>> + __func__); >>> >>> Map requests "will" fail. Is this the right error strategy? Detaching >>> a group cannot fail. Aren't we better off leaving the iova_list we had >>> in place? If we cannot expand the iova aperture when a group is >>> removed, a user can continue unscathed. >>> >>>> + goto done; >>>> + } >>>> + >>>> + /* adjust the iova with current reserved regions */ >>>> + if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions)) >>>> + pr_warn("%s: Failed to update iova list with reserve regions. >> VFIO DMA map request may fail\n", >>>> + __func__); >>> >>> Same. >>> >>>> +done: >>>> + list_for_each_entry_safe(resv, resv_next, &resv_regions, list) >>>> + kfree(resv); >>>> +} >>>> + >>>> static void vfio_iommu_type1_detach_group(void *iommu_data, >>>> struct iommu_group *iommu_group) >>>> { >>>> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void >> *iommu_data, >>>> break; >>>> } >>>> >>>> + vfio_iommu_iova_resv_refresh(iommu); >>>> + >>>> detach_group_done: >>>> mutex_unlock(&iommu->lock); >>>> } >>>