From: Sean Christopherson <seanjc@xxxxxxxxxx>

Extend guest_memfd to allow backing guest memory with hugepages. This is
done on a best-effort basis by default until a better-defined mechanism
is put in place that can provide better control/assurances to userspace
about hugepage allocations.

When reporting the max order as KVM gets a pfn from guest_memfd, force
order-0 pages if the hugepage is not fully contained by the memslot
binding, e.g. if userspace requested hugepages but punches a hole in the
memslot bindings in order to emulate x86's VGA hole.

Link: https://lore.kernel.org/kvm/20231027182217.3615211-1-seanjc@xxxxxxxxxx/T/#mccbd3e8bf9897f0ddbf864e6318d6f2f208b269c
Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
Message-Id: <20231027182217.3615211-18-seanjc@xxxxxxxxxx>
[Allow even with CONFIG_TRANSPARENT_HUGEPAGE; dropped momentarily due to
 uneasiness about the API. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
[mdr: based on discussion in the Link regarding the original patch, make
 the following set of changes:
 - For now, don't introduce an opt-in flag to enable hugepage support.
   By default, just make a best-effort attempt at PMD_ORDER allocations
   so that there are no false assurances to userspace that they'll get
   hugepages. Performance-wise, it's at least better than the current
   guarantee that they will get 4K pages every time. A more proper
   opt-in interface can then improve on things later.
 - Pass __GFP_NOWARN for the hugepage allocation so failures are not
   disruptive to normal operations.
 - Drop size checks at creation time. Instead, just avoid huge
   allocations if they would extend beyond the end of the memfd.
 - Drop hugepage-related unit tests since everything is now handled
   transparently to userspace anyway.
 - Update the commit message accordingly.]
Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/guest_memfd.c   | 68 +++++++++++++++++++++++++++++++---------
 virt/kvm/kvm_main.c      |  4 +++
 3 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c7e4f8be3e17..c946ec98d614 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2278,6 +2278,8 @@ extern unsigned int halt_poll_ns_grow;
 extern unsigned int halt_poll_ns_grow_start;
 extern unsigned int halt_poll_ns_shrink;
 
+extern unsigned int gmem_2m_enabled;
+
 struct kvm_device {
 	const struct kvm_device_ops *ops;
 	struct kvm *kvm;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 9a5172de6a03..d0caec99fe03 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -273,6 +273,36 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct file *file,
 	return r;
 }
 
+static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index,
+					     unsigned int order)
+{
+	pgoff_t npages = 1UL << order;
+	pgoff_t huge_index = round_down(index, npages);
+	struct address_space *mapping = inode->i_mapping;
+	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NOWARN;
+	loff_t size = i_size_read(inode);
+	struct folio *folio;
+
+	/* Make sure hugepages would be fully-contained by inode */
+	if ((huge_index + npages) * PAGE_SIZE > size)
+		return NULL;
+
+	if (filemap_range_has_page(mapping, (loff_t)huge_index << PAGE_SHIFT,
+				   (loff_t)(huge_index + npages - 1) << PAGE_SHIFT))
+		return NULL;
+
+	folio = filemap_alloc_folio(gfp, order);
+	if (!folio)
+		return NULL;
+
+	if (filemap_add_folio(mapping, folio, huge_index, gfp)) {
+		folio_put(folio);
+		return NULL;
+	}
+
+	return folio;
+}
+
 /*
  * Returns a locked folio on success.  The caller is responsible for
  * setting the up-to-date flag before the memory is mapped into the guest.
@@ -284,8 +314,15 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct file *file,
  */
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 {
-	/* TODO: Support huge pages. */
-	return filemap_grab_folio(inode->i_mapping, index);
+	struct folio *folio = NULL;
+
+	if (gmem_2m_enabled)
+		folio = kvm_gmem_get_huge_folio(inode, index, PMD_ORDER);
+
+	if (!folio)
+		folio = filemap_grab_folio(inode->i_mapping, index);
+
+	return folio;
 }
 
 static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -660,6 +697,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	inode->i_size = size;
 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 	mapping_set_inaccessible(inode->i_mapping);
+	mapping_set_large_folios(inode->i_mapping);
 	/* Unmovable mappings are supposed to be marked unevictable as well. */
 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
 
@@ -791,6 +829,7 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 {
 	struct kvm_gmem *gmem = file->private_data;
 	struct folio *folio;
+	pgoff_t huge_index;
 
 	if (file != slot->gmem.file) {
 		WARN_ON_ONCE(slot->gmem.file);
@@ -803,6 +842,17 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 		return ERR_PTR(-EIO);
 	}
 
+	/*
+	 * The folio can be mapped with a hugepage if and only if the folio is
+	 * fully contained by the range the memslot is bound to.  Note, the
+	 * caller is responsible for handling gfn alignment, this only deals
+	 * with the file binding.
+	 */
+	huge_index = ALIGN_DOWN(index, 1ull << *max_order);
+	if (huge_index < slot->gmem.pgoff ||
+	    huge_index + (1ull << *max_order) > slot->gmem.pgoff + slot->npages)
+		*max_order = 0;
+
 	folio = kvm_gmem_get_folio(file_inode(file), index);
 	if (IS_ERR(folio))
 		return folio;
@@ -814,8 +864,7 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 	}
 
 	*pfn = folio_file_pfn(folio, index);
-	if (max_order)
-		*max_order = 0;
+	*max_order = min_t(int, *max_order, folio_order(folio));
 
 	return folio;
 }
@@ -910,17 +959,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 			break;
 		}
 
-		/*
-		 * The max order shouldn't extend beyond the GFN range being
-		 * populated in this iteration, so set max_order accordingly.
-		 * __kvm_gmem_get_pfn() will then further adjust the order to
-		 * one that is contained by the backing memslot/folio.
-		 */
-		max_order = 0;
-		while (IS_ALIGNED(gfn, 1 << (max_order + 1)) &&
-		       (npages - i >= (1 << (max_order + 1))))
-			max_order++;
-
 		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
 		if (IS_ERR(folio)) {
 			ret = PTR_ERR(folio);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5901d03e372c..525d136ba235 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -94,6 +94,10 @@ unsigned int halt_poll_ns_shrink = 2;
 module_param(halt_poll_ns_shrink, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
 
+unsigned int gmem_2m_enabled;
+EXPORT_SYMBOL_GPL(gmem_2m_enabled);
+module_param(gmem_2m_enabled, uint, 0644);
+
 /*
  * Allow direct access (from KVM or the CPU) without MMU notifier protection
  * to unpinned pages.
-- 
2.25.1
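As a quick aid for review, below is a minimal, self-contained userspace sketch of
the order-clamping check added to __kvm_gmem_get_pfn() above. It is illustration
only, not part of the patch: the binding offset/length values and the hard-coded
order of 9 (2M worth of 4K pages) are assumptions chosen to mimic the x86 VGA-hole
case from the commit message.

/*
 * Standalone illustration of the memslot-containment check: a hugepage
 * order is only kept if the aligned hugepage range lies entirely within
 * the memslot's guest_memfd binding; otherwise fall back to order 0.
 * All values below are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_PMD_ORDER	9	/* 2M in 4K pages; illustrative only */

static int clamp_max_order(uint64_t index, uint64_t bind_pgoff,
			   uint64_t bind_npages, int max_order)
{
	uint64_t npages = 1ull << max_order;
	uint64_t huge_index = index & ~(npages - 1);	/* ALIGN_DOWN */

	if (huge_index < bind_pgoff ||
	    huge_index + npages > bind_pgoff + bind_npages)
		return 0;	/* hugepage not fully contained: force 4K */

	return max_order;
}

int main(void)
{
	/* Slot bound to gmem pages [0, 0xa0): low memory below the VGA hole. */
	printf("index 0x9f  -> order %d\n",
	       clamp_max_order(0x9f, 0x0, 0xa0, EX_PMD_ORDER));
	/* Slot bound to a full, aligned 2M worth of pages. */
	printf("index 0x1ff -> order %d\n",
	       clamp_max_order(0x1ff, 0x0, 0x200, EX_PMD_ORDER));
	return 0;
}

Running it prints order 0 for the slot that stops at 0xa0 pages (the hugepage
would spill past the binding) and order 9 for the fully covered, aligned 2M
binding, matching the behavior the new huge_index check enforces.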