Before transitioning a guest_memfd folio to unshared, thereby disallowing access by the host and allowing the hypervisor to transition its view of the guest page as private, we need to be sure that the host doesn't have any references to the folio. This patch introduces a new type for guest_memfd folios, and uses that to register a callback that informs the guest_memfd subsystem when the last reference is dropped, therefore knowing that the host doesn't have any remaining references. Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx> --- The function kvm_slot_gmem_register_callback() isn't used in this series. It will be used later in code that performs unsharing of memory. I have tested it with pKVM, based on downstream code [*]. It's included in this RFC since it demonstrates the plan to handle unsharing of private folios. [*] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/guestmem-6.13-v4-pkvm --- include/linux/kvm_host.h | 11 +++ include/linux/page-flags.h | 7 ++ mm/debug.c | 1 + mm/swap.c | 4 + virt/kvm/guest_memfd.c | 145 +++++++++++++++++++++++++++++++++++++ 5 files changed, 168 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84aa7908a5dd..7ada5f78ded4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2574,6 +2574,8 @@ int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end); bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_slot_gmem_is_guest_mappable(struct kvm_memory_slot *slot, gfn_t gfn); +int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_gmem_handle_folio_put(struct folio *folio); #else static inline bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end) { @@ -2615,6 +2617,15 @@ static inline bool kvm_slot_gmem_is_guest_mappable(struct kvm_memory_slot *slot, WARN_ON_ONCE(1); return false; } +int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} +static inline void kvm_gmem_handle_folio_put(struct folio *folio) +{ + WARN_ON_ONCE(1); +} #endif /* CONFIG_KVM_GMEM_MAPPABLE */ #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index aca57802d7c7..b0e8e43de77c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -950,6 +950,7 @@ enum pagetype { PGTY_slab = 0xf5, PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, + PGTY_guestmem = 0xf8, PGTY_mapcount_underflow = 0xff }; @@ -1099,6 +1100,12 @@ FOLIO_TYPE_OPS(hugetlb, hugetlb) FOLIO_TEST_FLAG_FALSE(hugetlb) #endif +#ifdef CONFIG_KVM_GMEM_MAPPABLE +FOLIO_TYPE_OPS(guestmem, guestmem) +#else +FOLIO_TEST_FLAG_FALSE(guestmem) +#endif + PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) /* diff --git a/mm/debug.c b/mm/debug.c index 95b6ab809c0e..db93be385ed9 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -56,6 +56,7 @@ static const char *page_type_names[] = { DEF_PAGETYPE_NAME(table), DEF_PAGETYPE_NAME(buddy), DEF_PAGETYPE_NAME(unaccepted), + DEF_PAGETYPE_NAME(guestmem), }; static const char *page_type_name(unsigned int page_type) diff --git a/mm/swap.c b/mm/swap.c index 6f01b56bce13..15220eaabc86 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -37,6 +37,7 @@ #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> +#include <linux/kvm_host.h> #include "internal.h" @@ -103,6 +104,9 @@ static void free_typed_folio(struct folio *folio) case PGTY_offline: /* Nothing to do, it's offline. */ return; + case PGTY_guestmem: + kvm_gmem_handle_folio_put(folio); + return; default: WARN_ON_ONCE(1); } diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index d1c192927cf7..5ecaa5dfcd00 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -387,6 +387,28 @@ enum folio_mappability { KVM_GMEM_NONE_MAPPABLE = 0b11, /* Not mappable, transient state. */ }; +/* + * Unregisters the __folio_put() callback from the folio. + * + * Restores a folio's refcount after all pending references have been released, + * and removes the folio type, thereby removing the callback. Now the folio can + * be freed normaly once all actual references have been dropped. + * + * Must be called with the filemap (inode->i_mapping) invalidate_lock held. + * Must also have exclusive access to the folio: folio must be either locked, or + * gmem holds the only reference. + */ +static void __kvm_gmem_restore_pending_folio(struct folio *folio) +{ + if (WARN_ON_ONCE(folio_mapped(folio) || !folio_test_guestmem(folio))) + return; + + WARN_ON_ONCE(!folio_test_locked(folio) || folio_ref_count(folio) > 1); + + __folio_clear_guestmem(folio); + folio_ref_add(folio, folio_nr_pages(folio)); +} + /* * Marks the range [start, end) as mappable by both the host and the guest. * Usually called when guest shares memory with the host. @@ -400,7 +422,31 @@ static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end) filemap_invalidate_lock(inode->i_mapping); for (i = start; i < end; i++) { + struct folio *folio = NULL; + + /* + * If the folio is NONE_MAPPABLE, it indicates that it is + * transitioning to private (GUEST_MAPPABLE). Transition it to + * shared (ALL_MAPPABLE) immediately, and remove the callback. + */ + if (xa_to_value(xa_load(mappable_offsets, i)) == KVM_GMEM_NONE_MAPPABLE) { + folio = filemap_lock_folio(inode->i_mapping, i); + if (WARN_ON_ONCE(IS_ERR(folio))) { + r = PTR_ERR(folio); + break; + } + + if (folio_test_guestmem(folio)) + __kvm_gmem_restore_pending_folio(folio); + } + r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL)); + + if (folio) { + folio_unlock(folio); + folio_put(folio); + } + if (r) break; } @@ -473,6 +519,105 @@ static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end) return r; } +/* + * Registers a callback to __folio_put(), so that gmem knows that the host does + * not have any references to the folio. It does that by setting the folio type + * to guestmem. + * + * Returns 0 if the host doesn't have any references, or -EAGAIN if the host + * has references, and the callback has been registered. + * + * Must be called with the following locks held: + * - filemap (inode->i_mapping) invalidate_lock + * - folio lock + */ +static int __gmem_register_callback(struct folio *folio, struct inode *inode, pgoff_t idx) +{ + struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets; + void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE); + int refcount; + + rwsem_assert_held_write_nolockdep(&inode->i_mapping->invalidate_lock); + WARN_ON_ONCE(!folio_test_locked(folio)); + + if (folio_mapped(folio) || folio_test_guestmem(folio)) + return -EAGAIN; + + /* Register a callback first. */ + __folio_set_guestmem(folio); + + /* + * Check for references after setting the type to guestmem, to guard + * against potential races with the refcount being decremented later. + * + * At least one reference is expected because the folio is locked. + */ + + refcount = folio_ref_sub_return(folio, folio_nr_pages(folio)); + if (refcount == 1) { + int r; + + /* refcount isn't elevated, it's now faultable by the guest. */ + r = WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, idx, xval_guest, GFP_KERNEL))); + if (!r) + __kvm_gmem_restore_pending_folio(folio); + + return r; + } + + return -EAGAIN; +} + +int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn) +{ + unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn; + struct inode *inode = file_inode(slot->gmem.file); + struct folio *folio; + int r; + + filemap_invalidate_lock(inode->i_mapping); + + folio = filemap_lock_folio(inode->i_mapping, pgoff); + if (WARN_ON_ONCE(IS_ERR(folio))) { + r = PTR_ERR(folio); + goto out; + } + + r = __gmem_register_callback(folio, inode, pgoff); + + folio_unlock(folio); + folio_put(folio); +out: + filemap_invalidate_unlock(inode->i_mapping); + + return r; +} + +/* + * Callback function for __folio_put(), i.e., called when all references by the + * host to the folio have been dropped. This allows gmem to transition the state + * of the folio to mappable by the guest, and allows the hypervisor to continue + * transitioning its state to private, since the host cannot attempt to access + * it anymore. + */ +void kvm_gmem_handle_folio_put(struct folio *folio) +{ + struct xarray *mappable_offsets; + struct inode *inode; + pgoff_t index; + void *xval; + + inode = folio->mapping->host; + index = folio->index; + mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets; + xval = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE); + + filemap_invalidate_lock(inode->i_mapping); + __kvm_gmem_restore_pending_folio(folio); + WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, index, xval, GFP_KERNEL))); + filemap_invalidate_unlock(inode->i_mapping); +} + static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff) { struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets; -- 2.47.1.613.gc27f4b7a9f-goog