From: Sean Christopherson <seanjc@xxxxxxxxxx> Extended guest_memfd to allow backing guest memory with hugepages. This is done as a best-effort by default until a better-defined mechanism is put in place that can provide better control/assurances to userspace about hugepage allocations. When reporting the max order when KVM gets a pfn from guest_memfd, force order-0 pages if the hugepage is not fully contained by the memslot binding, e.g. if userspace requested hugepages but punches a hole in the memslot bindings in order to emulate x86's VGA hole. Link: https://lore.kernel.org/kvm/20231027182217.3615211-1-seanjc@xxxxxxxxxx/T/#mccbd3e8bf9897f0ddbf864e6318d6f2f208b269c Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx> Message-Id: <20231027182217.3615211-18-seanjc@xxxxxxxxxx> [Allow even with CONFIG_TRANSPARENT_HUGEPAGE; dropped momentarily due to uneasiness about the API. - Paolo] Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> [mdr: based on discussion in the Link regarding original patch, make the following set of changes: - For now, don't introduce an opt-in flag to enable hugepage support. By default, just make a best-effort for PMD_ORDER allocations so that there are no false assurances to userspace that they'll get hugepages. It's better at least than the current guarantee that they will get 4K pages every time. A more proper opt-in interface can then improve on things later. - Pass GFP_NOWARN to alloc_pages() so failures are not disruptive to normal operations - Drop size checks during creation time. Instead just avoid huge allocations if they extend beyond end of the memfd. - Drop hugepage-related unit tests since everything is now handled transparently to userspace anyway. - Update commit message accordingly.] Signed-off-by: Michael Roth <michael.roth@xxxxxxx> Signed-off-by: Michael Roth <michael.roth@xxxxxxx> --- virt/kvm/guest_memfd.c | 63 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 4aa23b01aa98..784690a664ac 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -13,14 +13,46 @@ struct kvm_gmem { struct list_head entry; }; -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index, + unsigned int order) { + pgoff_t npages = 1UL << order; + pgoff_t huge_index = round_down(index, npages); + struct address_space *mapping = inode->i_mapping; + gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NOWARN; + loff_t size = i_size_read(inode); struct folio *folio; - /* TODO: Support huge pages. */ - folio = filemap_grab_folio(inode->i_mapping, index); - if (IS_ERR_OR_NULL(folio)) + /* Make sure hugepages would be fully-contained by inode */ + if ((huge_index + npages) * PAGE_SIZE > size) + return NULL; + + if (filemap_range_has_page(mapping, (loff_t)huge_index << PAGE_SHIFT, + (loff_t)(huge_index + npages - 1) << PAGE_SHIFT)) + return NULL; + + folio = filemap_alloc_folio(gfp, order); + if (!folio) + return NULL; + + if (filemap_add_folio(mapping, folio, huge_index, gfp)) { + folio_put(folio); return NULL; + } + + return folio; +} + +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +{ + struct folio *folio; + + folio = kvm_gmem_get_huge_folio(inode, index, PMD_ORDER); + if (!folio) { + folio = filemap_grab_folio(inode->i_mapping, index); + if (IS_ERR_OR_NULL(folio)) + return NULL; + } /* * Use the up-to-date flag to track whether or not the memory has been @@ -361,6 +393,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_large_folios(inode->i_mapping); mapping_set_unmovable(inode->i_mapping); /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); @@ -486,7 +519,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order) { - pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; + pgoff_t index, huge_index; struct kvm_gmem *gmem; struct folio *folio; struct page *page; @@ -499,6 +532,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gmem = file->private_data; + index = gfn - slot->base_gfn + slot->gmem.pgoff; if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) { r = -EIO; goto out_fput; @@ -518,9 +552,24 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, page = folio_file_page(folio, index); *pfn = page_to_pfn(page); - if (max_order) - *max_order = 0; + if (!max_order) + goto success; + + *max_order = compound_order(compound_head(page)); + if (!*max_order) + goto success; + /* + * The folio can be mapped with a hugepage if and only if the folio is + * fully contained by the range the memslot is bound to. Note, the + * caller is responsible for handling gfn alignment, this only deals + * with the file binding. + */ + huge_index = ALIGN(index, 1ull << *max_order); + if (huge_index < ALIGN(slot->gmem.pgoff, 1ull << *max_order) || + huge_index + (1ull << *max_order) > slot->gmem.pgoff + slot->npages) + *max_order = 0; +success: r = 0; out_unlock: -- 2.25.1