On Tue, 17 Mar 2020 23:58:38 +0530
Kirti Wankhede <kwankhede@xxxxxxxxxx> wrote:

> On 3/14/2020 2:19 AM, Alex Williamson wrote:
> > On Thu, 12 Mar 2020 23:23:27 +0530
> > Kirti Wankhede <kwankhede@xxxxxxxxxx> wrote:
> >
> >> Added a check such that only singleton IOMMU groups can pin pages.
> >> From the point when vendor driver pins any pages, consider IOMMU group
> >> dirty page scope to be limited to pinned pages.
> >>
> >> To optimize to avoid walking list often, added flag
> >> pinned_page_dirty_scope to indicate if all of the vfio_groups for each
> >> vfio_domain in the domain_list dirty page scope is limited to pinned
> >> pages. This flag is updated on first pinned pages request for that IOMMU
> >> group and on attaching/detaching group.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
> >> Reviewed-by: Neo Jia <cjia@xxxxxxxxxx>
> >> ---
> >>  drivers/vfio/vfio.c             |  9 +++++-
> >>  drivers/vfio/vfio_iommu_type1.c | 72 +++++++++++++++++++++++++++++++++++++++--
> >>  include/linux/vfio.h            |  4 ++-
> >>  3 files changed, 80 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >> index c8482624ca34..79108c1245a5 100644
> >> --- a/drivers/vfio/vfio.c
> >> +++ b/drivers/vfio/vfio.c
> >> @@ -85,6 +85,7 @@ struct vfio_group {
> >>      atomic_t                    opened;
> >>      wait_queue_head_t           container_q;
> >>      bool                        noiommu;
> >> +    unsigned int                dev_counter;
> >>      struct kvm                  *kvm;
> >>      struct blocking_notifier_head notifier;
> >>  };
> >> @@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group,
> >>
> >>      mutex_lock(&group->device_lock);
> >>      list_add(&device->group_next, &group->device_list);
> >> +    group->dev_counter++;
> >>      mutex_unlock(&group->device_lock);
> >>
> >>      return device;
> >> @@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref)
> >>      struct vfio_group *group = device->group;
> >>
> >>      list_del(&device->group_next);
> >> +    group->dev_counter--;
> >>      mutex_unlock(&group->device_lock);
> >>
> >>      dev_set_drvdata(device->dev, NULL);
> >> @@ -1895,6 +1898,9 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
> >>      if (!group)
> >>          return -ENODEV;
> >>
> >> +    if (group->dev_counter > 1)
> >> +        return -EINVAL;
> >> +
> >>      ret = vfio_group_add_container_user(group);
> >>      if (ret)
> >>          goto err_pin_pages;
> >> @@ -1902,7 +1908,8 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
> >>      container = group->container;
> >>      driver = container->iommu_driver;
> >>      if (likely(driver && driver->ops->pin_pages))
> >> -        ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
> >> +        ret = driver->ops->pin_pages(container->iommu_data,
> >> +                                     group->iommu_group, user_pfn,
> >>                                       npage, prot, phys_pfn);
> >>      else
> >>          ret = -ENOTTY;
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 4f1f116feabc..18a284b230c0 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>      bool            v2;
> >>      bool            nesting;
> >>      bool            dirty_page_tracking;
> >> +    bool            pinned_page_dirty_scope;
> >>  };
> >>
> >>  struct vfio_domain {
> >> @@ -98,6 +99,7 @@ struct vfio_group {
> >>      struct iommu_group  *iommu_group;
> >>      struct list_head    next;
> >>      bool                mdev_group; /* An mdev group */
> >> +    bool                has_pinned_pages;
> >
> > I'm afraid over time this name will be confusing, should we simply
> > call it pinned_page_dirty_scope per vfio_group as well?
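For illustration only, a minimal sketch of how a vendor (mdev) driver's pin
request meets the new singleton-group check above; the helper name, device
pointer, and error handling are hypothetical, only the vfio_pin_pages() and
vfio_unpin_pages() signatures come from the quoted code:

#include <linux/device.h>
#include <linux/iommu.h>
#include <linux/vfio.h>

/* Hypothetical vendor-driver helper: pin a single guest pfn. */
static int example_pin_one_page(struct device *mdev_dev, unsigned long gfn)
{
    unsigned long user_pfn = gfn;   /* iova >> PAGE_SHIFT as tracked by the vendor driver */
    unsigned long phys_pfn;
    int ret;

    /*
     * With the check above, this now fails with -EINVAL whenever
     * mdev_dev's IOMMU group holds more than one device
     * (group->dev_counter > 1).
     */
    ret = vfio_pin_pages(mdev_dev, &user_pfn, 1,
                         IOMMU_READ | IOMMU_WRITE, &phys_pfn);
    if (ret != 1)
        return ret < 0 ? ret : -EFAULT;

    /* ... access the page at phys_pfn ... */

    return vfio_unpin_pages(mdev_dev, &user_pfn, 1) == 1 ? 0 : -EFAULT;
}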
> 
> Updating as you suggested, but I hope it doesn't look confusing.
> 
> > We might have to adapt this over time as we get new ways to dirty
> > pages, but each group voting towards the same value being set on the
> > vfio_iommu object seems like a good starting point.
> >
> >>  };
> >>
> >>  struct vfio_iova {
> >> @@ -129,6 +131,10 @@ struct vfio_regions {
> >>  static int put_pfn(unsigned long pfn, int prot);
> >>  static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu);
> >>
> >> +static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
> >> +                                       struct iommu_group *iommu_group);
> >> +
> >> +static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
> >>  /*
> >>   * This code handles mapping and unmapping of user data buffers
> >>   * into DMA'ble space using the IOMMU
> >> @@ -579,11 +585,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> >>  }
> >>
> >>  static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >> +                                      struct iommu_group *iommu_group,
> >>                                        unsigned long *user_pfn,
> >>                                        int npage, int prot,
> >>                                        unsigned long *phys_pfn)
> >>  {
> >>      struct vfio_iommu *iommu = iommu_data;
> >> +    struct vfio_group *group;
> >>      int i, j, ret;
> >>      unsigned long remote_vaddr;
> >>      struct vfio_dma *dma;
> >> @@ -662,8 +670,14 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> >>                                   (vpfn->iova - dma->iova) >> pgshift, 1);
> >>          }
> >>      }
> >> -
> >>      ret = i;
> >> +
> >> +    group = vfio_iommu_find_iommu_group(iommu, iommu_group);
> >> +    if (!group->has_pinned_pages) {
> >> +        group->has_pinned_pages = true;
> >> +        update_pinned_page_dirty_scope(iommu);
> >> +    }
> >> +
> >>      goto pin_done;
> >>
> >>  pin_unwind:
> >> @@ -946,8 +960,11 @@ static int vfio_iova_dirty_bitmap(struct vfio_iommu *iommu, dma_addr_t iova,
> >>      npages = dma->size >> pgshift;
> >>      bitmap_size = dirty_bitmap_bytes(npages);
> >>
> >> -    /* mark all pages dirty if all pages are pinned and mapped. */
> >> -    if (dma->iommu_mapped)
> >> +    /*
> >> +     * mark all pages dirty if any IOMMU capable device is not able
> >> +     * to report dirty pages and all pages are pinned and mapped.
> >> +     */
> >> +    if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped)
> >>          bitmap_set(dma->bitmap, 0, npages);
> >>
> >>      if (dma->bitmap) {
> >> @@ -1430,6 +1447,51 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
> >>      return NULL;
> >>  }
> >>
> >> +static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
> >> +                                       struct iommu_group *iommu_group)
> >> +{
> >> +    struct vfio_domain *domain;
> >> +    struct vfio_group *group = NULL;
> >> +
> >> +    list_for_each_entry(domain, &iommu->domain_list, next) {
> >> +        group = find_iommu_group(domain, iommu_group);
> >> +        if (group)
> >> +            return group;
> >> +    }
> >> +
> >> +    if (iommu->external_domain)
> >> +        group = find_iommu_group(iommu->external_domain, iommu_group);
> >> +
> >> +    return group;
> >> +}
> >> +
> >> +static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
> >> +{
> >> +    struct vfio_domain *domain;
> >> +    struct vfio_group *group;
> >> +
> >> +    list_for_each_entry(domain, &iommu->domain_list, next) {
> >> +        list_for_each_entry(group, &domain->group_list, next) {
> >> +            if (!group->has_pinned_pages) {
> >> +                iommu->pinned_page_dirty_scope = false;
> >> +                return;
> >> +            }
> >> +        }
> >> +    }
> >> +
> >> +    if (iommu->external_domain) {
> >> +        domain = iommu->external_domain;
> >> +        list_for_each_entry(group, &domain->group_list, next) {
> >> +            if (!group->has_pinned_pages) {
> >> +                iommu->pinned_page_dirty_scope = false;
> >> +                return;
> >> +            }
> >> +        }
> >> +    }
> >> +
> >> +    iommu->pinned_page_dirty_scope = true;
> >> +}
> >> +
> >>  static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
> >>                                    phys_addr_t *base)
> >>  {
> >> @@ -1836,6 +1898,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>
> >>          list_add(&group->next,
> >>                   &iommu->external_domain->group_list);
> >> +        update_pinned_page_dirty_scope(iommu);
> >>          mutex_unlock(&iommu->lock);
> >>
> >>          return 0;
> >> @@ -1958,6 +2021,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  done:
> >>      /* Delete the old one and insert new iova list */
> >>      vfio_iommu_iova_insert_copy(iommu, &iova_copy);
> >> +    update_pinned_page_dirty_scope(iommu);
> >>      mutex_unlock(&iommu->lock);
> >>      vfio_iommu_resv_free(&group_resv_regions);
> >>
> >
> > At this point we've added an iommu backed group that can't possibly
> > have pages pinned on behalf of this group yet, can't we just set
> > iommu->pinned_page_dirty_scope = false?
> >
> 
> Right, changing.
> 
> > In the previous case, aren't we adding a non-iommu backed group, so
> > should we presume the scope is pinned pages even before we have any?
> 
> Anyways we are updating it when pages are pinned, I think better not to
> presume.

If there's no iommu backing then the device doesn't have access to
dirty the pages itself, how else will they get dirty?  Perhaps I was a
little loose in using the word "presume"; I think there's a proof that
the pages must have limited dirty-scope.

> > We could almost forego the iommu scope update, but it could be the
> > first group added if we're going to preemptively assume the scope of
> > the group.
> >
> >> @@ -1972,6 +2036,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> >>  out_free:
> >>      kfree(domain);
> >>      kfree(group);
> >> +    update_pinned_page_dirty_scope(iommu);
> >
> > This one looks like paranoia given how late we update when the group
> > is added.
> >>      mutex_unlock(&iommu->lock);
> >>      return ret;
> >>  }
> >> @@ -2176,6 +2241,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
> >>      vfio_iommu_iova_free(&iova_copy);
> >>
> >>  detach_group_done:
> >> +    update_pinned_page_dirty_scope(iommu);
> >
> > We only need to do this if the group we're removing does not have
> > pinned page dirty scope, right?  I think we have all the info here to
> > make that optimization.
> >
> 
> There could be more than one group that doesn't have pinned page dirty
> scope, better to run through update_pinned_page_dirty_scope() function.

Maybe I stated it wrong above, but I think we have this table
(0 = NOT dirty-page-scope, 1 = IS dirty-page-scope; rows are the iommu
state, columns the group being removed):

 iommu|group
 -----+--------+---------+
 XXXXX|    0   |    1    |
 -----+--------+---------+
   0  |    A   |    B    |
 -----+--------+---------+
   1  |    C   |    D    |
 -----+--------+---------+

A: If we are NOT dirty-page-scope at the iommu and we remove a group
that is NOT dirty-page-scope, we need to check because that might have
been the group preventing the iommu from being dirty-page-scope.

B: If we are NOT dirty-page-scope at the iommu and we remove a group
that IS dirty-page-scope, we know that group wasn't limiting the scope
at the iommu.

C: If the iommu IS dirty-page-scope, we can't remove a group that is
NOT dirty-page-scope; this case is impossible.

D: If the iommu IS dirty-page-scope and we remove a group that IS
dirty-page-scope, nothing changes.

So I think we only need to update on A, or A+C since C cannot happen.
In B and D, removing a group with dirty-page-scope cannot change the
iommu scope.  Thanks,

Alex

> >>      mutex_unlock(&iommu->lock);
> >>  }
> >>
> >> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> >> index e42a711a2800..da29802d6276 100644
> >> --- a/include/linux/vfio.h
> >> +++ b/include/linux/vfio.h
> >> @@ -72,7 +72,9 @@ struct vfio_iommu_driver_ops {
> >>                                       struct iommu_group *group);
> >>      void    (*detach_group)(void *iommu_data,
> >>                              struct iommu_group *group);
> >> -    int     (*pin_pages)(void *iommu_data, unsigned long *user_pfn,
> >> +    int     (*pin_pages)(void *iommu_data,
> >> +                         struct iommu_group *group,
> >> +                         unsigned long *user_pfn,
> >>                           int npage, int prot,
> >>                           unsigned long *phys_pfn);
> >>      int     (*unpin_pages)(void *iommu_data,
> >
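To make the table concrete, here is a small standalone C model (not kernel
code; the names are simplified stand-ins for the vfio structures) of the
per-group voting, showing that a re-walk on detach is only needed when the
departing group did NOT have pinned-page dirty scope, i.e. case A:

#include <stdbool.h>
#include <stdio.h>

struct group {
    bool pinned_page_dirty_scope;  /* this group only dirties pages it has pinned */
};

struct iommu {
    struct group *groups[8];
    int ngroups;
    bool pinned_page_dirty_scope;  /* true only if every attached group votes true */
};

/* Re-walk all groups: the iommu has pinned-page scope iff every group does. */
static void update_pinned_page_dirty_scope(struct iommu *iommu)
{
    for (int i = 0; i < iommu->ngroups; i++) {
        if (!iommu->groups[i]->pinned_page_dirty_scope) {
            iommu->pinned_page_dirty_scope = false;
            return;
        }
    }
    iommu->pinned_page_dirty_scope = true;
}

/* Detach: only a departing group that voted false (case A) can change the result. */
static void detach_group(struct iommu *iommu, int idx)
{
    bool update = !iommu->groups[idx]->pinned_page_dirty_scope;

    iommu->groups[idx] = iommu->groups[--iommu->ngroups];
    if (update)
        update_pinned_page_dirty_scope(iommu);
}

int main(void)
{
    struct group a = { .pinned_page_dirty_scope = false };
    struct group b = { .pinned_page_dirty_scope = true };
    struct iommu iommu = { .groups = { &a, &b }, .ngroups = 2 };

    update_pinned_page_dirty_scope(&iommu);
    printf("before detach: %d\n", iommu.pinned_page_dirty_scope);  /* 0: group a limits scope */
    detach_group(&iommu, 0);                                       /* remove group a: case A */
    printf("after detach:  %d\n", iommu.pinned_page_dirty_scope);  /* 1 */
    return 0;
}

Removing a group that already votes true (cases B and D) leaves the AND of
the remaining votes unchanged, so skipping the walk there is safe.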