From: Sean Christopherson <seanjc@xxxxxxxxxx>

Extend guest_memfd to allow backing guest memory with hugepages. This is
done on a best-effort basis by default until a better-defined mechanism
is put in place that can provide better control/assurances to userspace
about hugepage allocations.

When reporting the max order as KVM gets a pfn from guest_memfd, force
order-0 pages if the hugepage is not fully contained by the memslot
binding, e.g. if userspace requested hugepages but punches a hole in the
memslot bindings in order to emulate x86's VGA hole.

Link: https://lore.kernel.org/kvm/20231027182217.3615211-1-seanjc@xxxxxxxxxx/T/#mccbd3e8bf9897f0ddbf864e6318d6f2f208b269c
Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
Message-Id: <20231027182217.3615211-18-seanjc@xxxxxxxxxx>
[Allow even with CONFIG_TRANSPARENT_HUGEPAGE; dropped momentarily due to
 uneasiness about the API. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
[mdr: based on discussion in the Link regarding the original patch, make
 the following set of changes:
 - For now, don't introduce an opt-in flag to enable hugepage support.
   By default, just make a best-effort attempt at PMD_ORDER allocations
   so that there are no false assurances to userspace that they'll get
   hugepages. Performance-wise, it's at least better than the current
   guarantee that they will get 4K pages every time. A more proper
   opt-in interface can then improve on things later.
 - Pass __GFP_NOWARN for the hugepage allocation so failures are not
   disruptive to normal operations.
 - Drop size checks at creation time. Instead, just avoid huge
   allocations if they would extend beyond the end of the memfd.
 - Drop hugepage-related unit tests since everything is now handled
   transparently to userspace anyway.
 - Update the commit message accordingly.]
Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/guest_memfd.c   | 68 +++++++++++++++++++++++++++++++---------
 virt/kvm/kvm_main.c      |  4 +++
 3 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c7e4f8be3e17..c946ec98d614 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2278,6 +2278,8 @@ extern unsigned int halt_poll_ns_grow;
 extern unsigned int halt_poll_ns_grow_start;
 extern unsigned int halt_poll_ns_shrink;
 
+extern unsigned int gmem_2m_enabled;
+
 struct kvm_device {
 	const struct kvm_device_ops *ops;
 	struct kvm *kvm;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 9a5172de6a03..d0caec99fe03 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -273,6 +273,36 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct file *file,
 	return r;
 }
 
+static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index,
+					     unsigned int order)
+{
+	pgoff_t npages = 1UL << order;
+	pgoff_t huge_index = round_down(index, npages);
+	struct address_space *mapping = inode->i_mapping;
+	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NOWARN;
+	loff_t size = i_size_read(inode);
+	struct folio *folio;
+
+	/* Make sure hugepages would be fully-contained by inode */
+	if ((huge_index + npages) * PAGE_SIZE > size)
+		return NULL;
+
+	if (filemap_range_has_page(mapping, (loff_t)huge_index << PAGE_SHIFT,
+				   (loff_t)(huge_index + npages - 1) << PAGE_SHIFT))
+		return NULL;
+
+	folio = filemap_alloc_folio(gfp, order);
+	if (!folio)
+		return NULL;
+
+	if (filemap_add_folio(mapping, folio, huge_index, gfp)) {
+		folio_put(folio);
+		return NULL;
+	}
+
+	return folio;
+}
+
 /*
  * Returns a locked folio on success.  The caller is responsible for
  * setting the up-to-date flag before the memory is mapped into the guest.
@@ -284,8 +314,15 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct file *file,
  */
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 {
-	/* TODO: Support huge pages. */
-	return filemap_grab_folio(inode->i_mapping, index);
+	struct folio *folio = NULL;
+
+	if (gmem_2m_enabled)
+		folio = kvm_gmem_get_huge_folio(inode, index, PMD_ORDER);
+
+	if (!folio)
+		folio = filemap_grab_folio(inode->i_mapping, index);
+
+	return folio;
 }
 
 static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -660,6 +697,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	inode->i_size = size;
 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 	mapping_set_inaccessible(inode->i_mapping);
+	mapping_set_large_folios(inode->i_mapping);
 	/* Unmovable mappings are supposed to be marked unevictable as well. */
 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
 
@@ -791,6 +829,7 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 {
 	struct kvm_gmem *gmem = file->private_data;
 	struct folio *folio;
+	pgoff_t huge_index;
 
 	if (file != slot->gmem.file) {
 		WARN_ON_ONCE(slot->gmem.file);
@@ -803,6 +842,17 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 		return ERR_PTR(-EIO);
 	}
 
+	/*
+	 * The folio can be mapped with a hugepage if and only if the folio is
+	 * fully contained by the range the memslot is bound to.  Note, the
+	 * caller is responsible for handling gfn alignment, this only deals
+	 * with the file binding.
+	 */
+	huge_index = ALIGN_DOWN(index, 1ull << *max_order);
+	if (huge_index < slot->gmem.pgoff ||
+	    huge_index + (1ull << *max_order) > slot->gmem.pgoff + slot->npages)
+		*max_order = 0;
+
 	folio = kvm_gmem_get_folio(file_inode(file), index);
 	if (IS_ERR(folio))
 		return folio;
@@ -814,8 +864,7 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 	}
 
 	*pfn = folio_file_pfn(folio, index);
-	if (max_order)
-		*max_order = 0;
+	*max_order = min_t(int, *max_order, folio_order(folio));
 
 	return folio;
 }
@@ -910,17 +959,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 			break;
 		}
 
-		/*
-		 * The max order shouldn't extend beyond the GFN range being
-		 * populated in this iteration, so set max_order accordingly.
-		 * __kvm_gmem_get_pfn() will then further adjust the order to
-		 * one that is contained by the backing memslot/folio.
-		 */
-		max_order = 0;
-		while (IS_ALIGNED(gfn, 1 << (max_order + 1)) &&
-		       (npages - i >= (1 << (max_order + 1))))
-			max_order++;
-
 		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
 		if (IS_ERR(folio)) {
 			ret = PTR_ERR(folio);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5901d03e372c..525d136ba235 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -94,6 +94,10 @@ unsigned int halt_poll_ns_shrink = 2;
 module_param(halt_poll_ns_shrink, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
 
+unsigned int gmem_2m_enabled;
+EXPORT_SYMBOL_GPL(gmem_2m_enabled);
+module_param(gmem_2m_enabled, uint, 0644);
+
 /*
  * Allow direct access (from KVM or the CPU) without MMU notifier protection
  * to unpinned pages.
-- 
2.25.1
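As a quick aid for review, below is a minimal, self-contained userspace sketch of
the order-clamping check added to __kvm_gmem_get_pfn() above. It is illustration
only, not part of the patch: the binding offset/length values and the hard-coded
order of 9 (2M worth of 4K pages) are assumptions chosen to mimic the x86 VGA-hole
case from the commit message.

/*
 * Standalone illustration of the memslot-containment check: a hugepage
 * order is only kept if the aligned hugepage range lies entirely within
 * the memslot's guest_memfd binding; otherwise fall back to order 0.
 * All values below are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_PMD_ORDER	9	/* 2M in 4K pages; illustrative only */

static int clamp_max_order(uint64_t index, uint64_t bind_pgoff,
			   uint64_t bind_npages, int max_order)
{
	uint64_t npages = 1ull << max_order;
	uint64_t huge_index = index & ~(npages - 1);	/* ALIGN_DOWN */

	if (huge_index < bind_pgoff ||
	    huge_index + npages > bind_pgoff + bind_npages)
		return 0;	/* hugepage not fully contained: force 4K */

	return max_order;
}

int main(void)
{
	/* Slot bound to gmem pages [0, 0xa0): low memory below the VGA hole. */
	printf("index 0x9f  -> order %d\n",
	       clamp_max_order(0x9f, 0x0, 0xa0, EX_PMD_ORDER));
	/* Slot bound to a full, aligned 2M worth of pages. */
	printf("index 0x1ff -> order %d\n",
	       clamp_max_order(0x1ff, 0x0, 0x200, EX_PMD_ORDER));
	return 0;
}

Running it prints order 0 for the slot that stops at 0xa0 pages (the hugepage
would spill past the binding) and order 9 for the fully covered, aligned 2M
binding, matching the behavior the new huge_index check enforces.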