Re: [PATCH v3 8/15] iommufd: Algorithms for PFN storage

On Tue, Oct 25, 2022 at 03:12:17PM -0300, Jason Gunthorpe wrote:
> +
> +/**
> + * iopt_area_fill_domains() - Install PFNs into the area's domains
> + * @area: The area to act on
> + * @pages: The pages associated with the area (area->pages is NULL)
> + *
> + * Called during area creation. The area is freshly created and not inserted in
> + * the domains_itree yet. PFNs are read and loaded into every domain held in the
> + * area's io_pagetable and the area is installed in the domains_itree.
> + *
> + * On failure all domains are left unchanged.
> + */
> +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
> +{
> +	struct pfn_reader pfns;
> +	struct iommu_domain *domain;
> +	unsigned long unmap_index;
> +	unsigned long index;
> +	int rc;
> +
> +	lockdep_assert_held(&area->iopt->domains_rwsem);
> +
> +	if (xa_empty(&area->iopt->domains))
> +		return 0;
> +
> +	mutex_lock(&pages->mutex);
> +	rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
> +			      iopt_area_last_index(area));
> +	if (rc)
> +		goto out_unlock;
> +
> +	while (!pfn_reader_done(&pfns)) {
> +		xa_for_each(&area->iopt->domains, index, domain) {
> +			rc = batch_to_domain(&pfns.batch, domain, area,
> +					     pfns.batch_start_index);
> +			if (rc)
> +				goto out_unmap;
> +		}
> +
> +		rc = pfn_reader_next(&pfns);
> +		if (rc)
> +			goto out_unmap;
> +	}
> +	rc = pfn_reader_update_pinned(&pfns);
> +	if (rc)
> +		goto out_unmap;
> +
> +	area->storage_domain = xa_load(&area->iopt->domains, 0);
> +	interval_tree_insert(&area->pages_node, &pages->domains_itree);
> +	goto out_destroy;
> +
> +out_unmap:
> +	xa_for_each(&area->iopt->domains, unmap_index, domain) {
> +		unsigned long end_index = pfns.batch_start_index;
> +
> +		if (unmap_index <= index)
> +			end_index = pfns.batch_end_index;

Syzkaller found that there is a typo here; it should be <.

However, I wasn't able to make a quick reproduction of something that
should have been a very reliable failure path using nth fault
injection. This led to a great big adventure where I discovered that
fault injection and xarray do not play nicely together:

https://lore.kernel.org/r/Y2QR0EDvq7p9i1xw@xxxxxxxxxx

That ended up with me spending a whole bunch of time adding an nth
failure study to the test suite and understanding what is going on and
how to make it work better. It now covers this scenario
deterministically.

The exhaustive nth failure study also shows this error handling has
another, more serious, problem. We keep track of how many pages have
been pinned inside the struct iopt_pages, and we also keep track of
the last charge made against the rlimit/etc. At the end of an
operation these are reconciled. There are lots of assertions checking
that this is tracked properly so that we don't lose track of a pinned
page in the very complicated logic.

The new test suite discovered this missing:

 		/* Any pages not transferred to the batch are just unpinned */
 		unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
 						      pfns->user.upages_start),
 				 npages);
+		iopt_pages_sub_npinned(pages, npages);

We need to charge back the as-yet-unprocessed pages we are unpinning
when destroying the batch.
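
To make the bookkeeping concrete, here is a rough model of the two
counters and why every unpin must be mirrored in npinned (simplified
and hypothetical names, not the real iommufd code; the real fields
are struct iopt_pages' npinned/last_npinned as seen in the diff
below):

/* Simplified stand-in for the counters in struct iopt_pages */
struct pin_accounting {
	unsigned long npinned;      /* pages currently pinned */
	unsigned long last_npinned; /* npinned at the last rlimit charge */
};

/* Every unpin, including on error paths, must be mirrored here ... */
void acct_sub_npinned(struct pin_accounting *a, unsigned long npages)
{
	a->npinned -= npages;
}

/* ... and each charge brings last_npinned up to date ... */
void acct_update_pinned(struct pin_accounting *a)
{
	/* charge_rlimit(a->npinned - a->last_npinned); (hypothetical) */
	a->last_npinned = a->npinned;
}

/*
 * ... so that when everything is torn down the two counters agree,
 * which is what the WARN_ON() in pfn_reader_destroy() checks.
 */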

And then we get into trouble because things do not happen in the
order the assertions would like, as this:

> +			iopt_area_unfill_partial_domain(area, pages, domain,
> +							end_index);

Demands that the pinned page charge only decrease during unfilling,
never increase. However, we can still be holding pinned pages in the
batch at this instant, and they don't get cleaned up until:

> +		}
> +	}
> +out_destroy:
> +	pfn_reader_destroy(&pfns);

Here

Thus the assertions get unhappy.

Introducing a pfn_reader_release_pins() that is called before
unfilling gets everything in the right order, and the testing of
these two functions now passes, though I still have to insert a few
more error injection points to get full coverage.
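
In code terms, the error path of iopt_area_fill_domain() ends up
ordered roughly like this (condensed from the hunk further down, not
the complete function):

out_unmap:
	/* Drop the pins still held by the pfn_reader first, so from
	 * here on npinned only decreases */
	pfn_reader_release_pins(&pfns);
	/* Then unwind whatever was already mapped into the domain */
	iopt_area_unfill_partial_domain(area, area->pages, domain,
					done_end_index);
out_destroy:
	/* No batch pins are left by the time destroy runs */
	pfn_reader_destroy(&pfns);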

Syzkaller has found another 4 things I still have to look at, and it
is now sitting at 65% (72%) coverage. So, steadily progressing...

Jason

diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 245e7b96902107..ce707d6f5ee959 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -994,7 +994,15 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
 	return 0;
 }
 
-static void pfn_reader_destroy(struct pfn_reader *pfns)
+/*
+ * There are many assertions regarding the state of pages->npinned vs
+ * pages->last_pinned, for instance something like unmapping a domain must only
+ * decrement the npinned, and pfn_reader_destroy() must be called only after all
+ * the pins are updated. This is fine for success flows, but error flows
+ * sometimes need to release the pins held inside the pfn_reader before going on
+ * to complete unmapping and releasing pins held in domains.
+ */
+static void pfn_reader_release_pins(struct pfn_reader *pfns)
 {
 	struct iopt_pages *pages = pfns->pages;
 
@@ -1005,12 +1013,20 @@ static void pfn_reader_destroy(struct pfn_reader *pfns)
 		unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
 						      pfns->user.upages_start),
 				 npages);
+		iopt_pages_sub_npinned(pages, npages);
+		pfns->user.upages_end = pfns->batch_end_index;
 	}
-
-	pfn_reader_user_destroy(&pfns->user, pfns->pages);
-
 	if (pfns->batch_start_index != pfns->batch_end_index)
 		pfn_reader_unpin(pfns);
+	pfns->batch_start_index = pfns->batch_end_index;
+}
+
+static void pfn_reader_destroy(struct pfn_reader *pfns)
+{
+	struct iopt_pages *pages = pfns->pages;
+
+	pfn_reader_release_pins(pfns);
+	pfn_reader_user_destroy(&pfns->user, pfns->pages);
 	batch_destroy(&pfns->batch, NULL);
 	WARN_ON(pages->last_npinned != pages->npinned);
 }
@@ -1223,6 +1239,7 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
  */
 int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
 {
+	unsigned long done_end_index;
 	struct pfn_reader pfns;
 	int rc;
 
@@ -1234,10 +1251,12 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
 		return rc;
 
 	while (!pfn_reader_done(&pfns)) {
+		done_end_index = pfns.batch_start_index;
 		rc = batch_to_domain(&pfns.batch, domain, area,
 				     pfns.batch_start_index);
 		if (rc)
 			goto out_unmap;
+		done_end_index = pfns.batch_end_index;
 
 		rc = pfn_reader_next(&pfns);
 		if (rc)
@@ -1250,8 +1269,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
 	goto out_destroy;
 
 out_unmap:
+	pfn_reader_release_pins(&pfns);
 	iopt_area_unfill_partial_domain(area, area->pages, domain,
-					pfns.batch_start_index);
+					done_end_index);
 out_destroy:
 	pfn_reader_destroy(&pfns);
 	return rc;
@@ -1270,9 +1290,11 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
  */
 int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
 {
-	struct pfn_reader pfns;
+	unsigned long done_first_end_index;
+	unsigned long done_all_end_index;
 	struct iommu_domain *domain;
 	unsigned long unmap_index;
+	struct pfn_reader pfns;
 	unsigned long index;
 	int rc;
 
@@ -1288,12 +1310,15 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
 		goto out_unlock;
 
 	while (!pfn_reader_done(&pfns)) {
+		done_first_end_index = pfns.batch_end_index;
+		done_all_end_index = pfns.batch_start_index;
 		xa_for_each(&area->iopt->domains, index, domain) {
 			rc = batch_to_domain(&pfns.batch, domain, area,
 					     pfns.batch_start_index);
 			if (rc)
 				goto out_unmap;
 		}
+		done_all_end_index = done_first_end_index;
 
 		rc = pfn_reader_next(&pfns);
 		if (rc)
@@ -1308,11 +1333,14 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
 	goto out_destroy;
 
 out_unmap:
+	pfn_reader_release_pins(&pfns);
 	xa_for_each(&area->iopt->domains, unmap_index, domain) {
-		unsigned long end_index = pfns.batch_start_index;
+		unsigned long end_index;
 
-		if (unmap_index <= index)
-			end_index = pfns.batch_end_index;
+		if (unmap_index < index)
+			end_index = done_first_end_index;
+		else
+			end_index = done_all_end_index;
 
 		/*
 		 * The area is not yet part of the domains_itree so we have to


