[PATCH 2/5] KVM: gmem: Don't clear pages that have already been prepared

Michael Roth <michael.roth@xxxxxxx> · Thu, 12 Dec 2024 00:36:32 -0600

Currently kvm_gmem_prepare_folio() and kvm_gmem_mark_prepared() try to
use the folio order to determine the range of PFNs that needs to be
cleared before usage and subsequently marked prepared. There may however
be cases, at least once hugepage support is added, where some PFNs may
have been previously prepared when kvm_gmem_prepare_folio() was called
with a smaller max_order than the current one, and this can lead to the
current code attempting to clear pages that have already been prepared.

It also makes sense to provide more control to the caller over what
order to use, since interfaces like kvm_gmem_populate() might
specifically want to prepare sub-ranges while leaving other PFNs within
the folio in an unprepared state. It could be argued that
opportunistically preparing additional pages isn't necessarily a bad
thing, but this will complicate things down the road when future uses
cases like using gmem for both shared/private guest memory come along.

Address these issues by allowing the callers of
kvm_gmem_prepare_folio()/kvm_gmem_mark_prepared() to explicitly specify
the order of the range being prepared, and in cases where these ranges
overlap with previously-prepared pages, do not attempt to re-clear the
pages.

Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
---
 virt/kvm/guest_memfd.c | 106 ++++++++++++++++++++++++++---------------
 1 file changed, 68 insertions(+), 38 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index aa0038ddf4a4..6907ae9fe149 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -96,15 +96,15 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
 }
 
 static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
-				    pgoff_t index, struct folio *folio)
+				    pgoff_t index, struct folio *folio, int max_order)
 {
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
 	kvm_pfn_t pfn = folio_file_pfn(folio, index);
 	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
-	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
+	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, max_order);
 	if (rc) {
-		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
-				    index, gfn, pfn, rc);
+		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx max_order %d error %d.\n",
+				    index, gfn, pfn, max_order, rc);
 		return rc;
 	}
 #endif
@@ -148,15 +148,15 @@ static bool bitmap_test_allset_word(unsigned long *p, unsigned long start, unsig
 	return (*p & mask_to_set) == mask_to_set;
 }
 
-static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
+static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, int order)
 {
 	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
-	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
-	unsigned long npages = folio_nr_pages(folio);
+	unsigned long npages = (1ul << order);
+	unsigned long *p;
 
-	/* Folios must be naturally aligned */
-	WARN_ON_ONCE(index & (npages - 1));
+	/* The index isn't necessarily aligned to the requested order. */
 	index &= ~(npages - 1);
+	p = i_gmem->prepared + BIT_WORD(index);
 
 	/* Clear page before updating bitmap.  */
 	smp_wmb();
@@ -193,16 +193,16 @@ static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index, p
 		bitmap_clear_atomic_word(p++, 0, npages);
 }
 
-static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
+static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, int order)
 {
 	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
-	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
-	unsigned long npages = folio_nr_pages(folio);
+	unsigned long npages = (1ul << order);
+	unsigned long *p;
 	bool ret;
 
-	/* Folios must be naturally aligned */
-	WARN_ON_ONCE(index & (npages - 1));
+	/* The index isn't necessarily aligned to the requested order. */
 	index &= ~(npages - 1);
+	p = i_gmem->prepared + BIT_WORD(index);
 
 	if (npages < BITS_PER_LONG) {
 		ret = bitmap_test_allset_word(p, index, npages);
@@ -226,35 +226,41 @@ static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio
  */
 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct file *file,
 				  struct kvm_memory_slot *slot,
-				  gfn_t gfn, struct folio *folio)
+				  gfn_t gfn, struct folio *folio, int max_order)
 {
 	unsigned long nr_pages, i;
-	pgoff_t index;
+	pgoff_t index, aligned_index;
 	int r;
 
-	nr_pages = folio_nr_pages(folio);
+	index = gfn - slot->base_gfn + slot->gmem.pgoff;
+	nr_pages = (1ull << max_order);
+	WARN_ON(nr_pages > folio_nr_pages(folio));
+	aligned_index = ALIGN_DOWN(index, nr_pages);
+
 	for (i = 0; i < nr_pages; i++)
-		clear_highpage(folio_page(folio, i));
+		if (!kvm_gmem_is_prepared(file, aligned_index + i, 0))
+			clear_highpage(folio_page(folio, aligned_index - folio_index(folio) + i));
 
 	/*
-	 * Preparing huge folios should always be safe, since it should
-	 * be possible to split them later if needed.
-	 *
-	 * Right now the folio order is always going to be zero, but the
-	 * code is ready for huge folios.  The only assumption is that
-	 * the base pgoff of memslots is naturally aligned with the
-	 * requested page order, ensuring that huge folios can also use
-	 * huge page table entries for GPA->HPA mapping.
+	 * In cases where only a sub-range of a folio is prepared, e.g. via
+	 * calling kvm_gmem_populate() for a non-aligned GPA range, or when
+	 * there's a mix of private/shared attributes for the GPA range that
+	 * the folio backs, it's possible that later on the same folio might
+	 * be accessed with a larger order when it becomes possible to map
+	 * the full GPA range into the guest using a larger order. In such
+	 * cases, some sub-ranges might already have been prepared.
 	 *
-	 * The order will be passed when creating the guest_memfd, and
-	 * checked when creating memslots.
+	 * Because of this, the arch-specific callbacks should be expected
+	 * to handle dealing with cases where some sub-ranges are already
+	 * in a prepared state, since the alternative would involve needing
+	 * to issue multiple prepare callbacks with finer granularity, and
+	 * potentially obfuscating cases where arch-specific callbacks can
+	 * be notified of larger-order mappings and potentially optimize
+	 * preparation based on that knowledge.
 	 */
-	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
-	index = gfn - slot->base_gfn + slot->gmem.pgoff;
-	index = ALIGN_DOWN(index, 1 << folio_order(folio));
-	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
+	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio, max_order);
 	if (!r)
-		kvm_gmem_mark_prepared(file, index, folio);
+		kvm_gmem_mark_prepared(file, index, max_order);
 
 	return r;
 }
@@ -812,20 +818,31 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 {
 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
 	struct file *file = kvm_gmem_get_file(slot);
+	int max_order_local;
 	struct folio *folio;
 	int r = 0;
 
 	if (!file)
 		return -EFAULT;
 
-	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
+	/*
+	 * The caller might pass a NULL 'max_order', but internally this
+	 * function needs to be aware of any order limitations set by
+	 * __kvm_gmem_get_pfn() so the scope of preparation operations can
+	 * be limited to the corresponding range. The initial order can be
+	 * arbitrarily large, but gmem doesn't currently support anything
+	 * greater than PMD_ORDER so use that for now.
+	 */
+	max_order_local = PMD_ORDER;
+
+	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &max_order_local);
 	if (IS_ERR(folio)) {
 		r = PTR_ERR(folio);
 		goto out;
 	}
 
-	if (kvm_gmem_is_prepared(file, index, folio))
-		r = kvm_gmem_prepare_folio(kvm, file, slot, gfn, folio);
+	if (!kvm_gmem_is_prepared(file, index, max_order_local))
+		r = kvm_gmem_prepare_folio(kvm, file, slot, gfn, folio, max_order_local);
 
 	folio_unlock(folio);
 
@@ -835,6 +852,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 		folio_put(folio);
 
 out:
+	if (max_order)
+		*max_order = max_order_local;
 	fput(file);
 	return r;
 }
@@ -877,13 +896,24 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 			break;
 		}
 
+		/*
+		 * The max order shouldn't extend beyond the GFN range being
+		 * populated in this iteration, so set max_order accordingly.
+		 * __kvm_gmem_get_pfn() will then further adjust the order to
+		 * one that is contained by the backing memslot/folio.
+		 */
+		max_order = 0;
+		while (IS_ALIGNED(gfn, 1 << (max_order + 1)) &&
+		       (npages - i >= (1 << (max_order + 1))))
+			max_order++;
+
 		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
 		if (IS_ERR(folio)) {
 			ret = PTR_ERR(folio);
 			break;
 		}
 
-		if (kvm_gmem_is_prepared(file, index, folio)) {
+		if (kvm_gmem_is_prepared(file, index, max_order)) {
 			folio_unlock(folio);
 			folio_put(folio);
 			ret = -EEXIST;
@@ -907,7 +937,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
 		if (!ret) {
 			pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
-			kvm_gmem_mark_prepared(file, index, folio);
+			kvm_gmem_mark_prepared(file, index, max_order);
 		}
 
 put_folio_and_exit:
-- 
2.25.1