On Wed, Nov 20, 2024 at 10:12:08AM -0800, Elliot Berman wrote: > diff --git a/mm/guestmem.c b/mm/guestmem.c > new file mode 100644 > index 0000000000000000000000000000000000000000..19dd7e5d498f07577ec5cec5b52055f7435980f4 > --- /dev/null > +++ b/mm/guestmem.c > @@ -0,0 +1,196 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * guestmem library > + * > + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. > + */ > + > +#include <linux/fs.h> > +#include <linux/guestmem.h> > +#include <linux/mm.h> > +#include <linux/pagemap.h> > + > +struct guestmem { > + const struct guestmem_ops *ops; > +}; > + > +static inline struct guestmem *folio_to_guestmem(struct folio *folio) > +{ > + struct address_space *mapping = folio->mapping; > + > + return mapping->i_private_data; > +} > + > +static inline bool __guestmem_release_folio(struct address_space *mapping, > + struct folio *folio) > +{ > + struct guestmem *gmem = mapping->i_private_data; > + struct list_head *entry; > + > + if (gmem->ops->release_folio) { > + list_for_each(entry, &mapping->i_private_list) { > + if (!gmem->ops->release_folio(entry, folio)) > + return false; > + } > + } > + > + return true; > +} > + > +static inline int > +__guestmem_invalidate_begin(struct address_space *const mapping, pgoff_t start, > + pgoff_t end) > +{ > + struct guestmem *gmem = mapping->i_private_data; > + struct list_head *entry; > + int ret = 0; > + > + list_for_each(entry, &mapping->i_private_list) { > + ret = gmem->ops->invalidate_begin(entry, start, end); > + if (ret) > + return ret; > + } > + > + return 0; > +} > + > +static inline void > +__guestmem_invalidate_end(struct address_space *const mapping, pgoff_t start, > + pgoff_t end) > +{ > + struct guestmem *gmem = mapping->i_private_data; > + struct list_head *entry; > + > + if (gmem->ops->invalidate_end) { > + list_for_each(entry, &mapping->i_private_list) > + gmem->ops->invalidate_end(entry, start, end); > + } > +} > + > +static void guestmem_free_folio(struct address_space *mapping, > + struct folio *folio) > +{ > + WARN_ON_ONCE(!__guestmem_release_folio(mapping, folio)); > +} > + > +static int guestmem_error_folio(struct address_space *mapping, > + struct folio *folio) > +{ > + pgoff_t start, end; > + int ret; > + > + filemap_invalidate_lock_shared(mapping); > + > + start = folio->index; > + end = start + folio_nr_pages(folio); > + > + ret = __guestmem_invalidate_begin(mapping, start, end); > + if (ret) > + goto out; > + > + /* > + * Do not truncate the range, what action is taken in response to the > + * error is userspace's decision (assuming the architecture supports > + * gracefully handling memory errors). If/when the guest attempts to > + * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON, > + * at which point KVM can either terminate the VM or propagate the > + * error to userspace. > + */ > + > + __guestmem_invalidate_end(mapping, start, end); > + > +out: > + filemap_invalidate_unlock_shared(mapping); > + return ret ? MF_DELAYED : MF_FAILED; > +} > + > +static int guestmem_migrate_folio(struct address_space *mapping, > + struct folio *dst, struct folio *src, > + enum migrate_mode mode) > +{ > + WARN_ON_ONCE(1); > + return -EINVAL; > +} > + > +static const struct address_space_operations guestmem_aops = { > + .dirty_folio = noop_dirty_folio, > + .free_folio = guestmem_free_folio, > + .error_remove_folio = guestmem_error_folio, > + .migrate_folio = guestmem_migrate_folio, > +}; > + > +int guestmem_attach_mapping(struct address_space *mapping, > + const struct guestmem_ops *const ops, > + struct list_head *data) > +{ > + struct guestmem *gmem; > + > + if (mapping->a_ops == &guestmem_aops) { > + gmem = mapping->i_private_data; > + if (gmem->ops != ops) > + return -EINVAL; > + > + goto add; > + } > + > + gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); > + if (!gmem) > + return -ENOMEM; > + > + gmem->ops = ops; > + > + mapping->a_ops = &guestmem_aops; > + mapping->i_private_data = gmem; > + > + mapping_set_gfp_mask(mapping, GFP_HIGHUSER); > + mapping_set_inaccessible(mapping); > + /* Unmovable mappings are supposed to be marked unevictable as well. */ > + WARN_ON_ONCE(!mapping_unevictable(mapping)); > + > +add: > + list_add(data, &mapping->i_private_list); > + return 0; > +} > +EXPORT_SYMBOL_GPL(guestmem_attach_mapping); > + > +void guestmem_detach_mapping(struct address_space *mapping, > + struct list_head *data) > +{ > + list_del(data); > + > + if (list_empty(&mapping->i_private_list)) { > + kfree(mapping->i_private_data); Mike was helping me test this out for SEV-SNP. They helped find a bug here. Right now, when the file closes, KVM calls guestmem_detach_mapping() which will uninstall the ops. When that happens, it's not necessary that all of the folios aren't removed from the filemap yet and so our free_folio() callback isn't invoked. This means that we skip updating the RMP entry back to shared/KVM-owned. There are a few approaches I could take: 1. Create a guestmem superblock so I can register guestmem-specific destroy_inode() to do the kfree() above. This requires a lot of boilerplate code, and I think it's not preferred approach. 2. Update how KVM tracks the memory so it is back in "shared" state when the file closes. This requires some significant rework about the page state compared to current guest_memfd. That rework might be useful for the shared/private state machine. 3. Call truncate_inode_pages(mapping, 0) to force pages to be freed here. It's might be possible that a page is allocated after this point. In order for that to be a problem, KVM would need to update RMP entry as guest-owned, and I don't believe that's possible after the last guestmem_detach_mapping(). My preference is to go with #3 as it was the most easy thing to do. > + mapping->i_private_data = NULL; > + mapping->a_ops = &empty_aops; > + } > +} > +EXPORT_SYMBOL_GPL(guestmem_detach_mapping); > + > +struct folio *guestmem_grab_folio(struct address_space *mapping, pgoff_t index) > +{ > + /* TODO: Support huge pages. */ > + return filemap_grab_folio(mapping, index); > +} > +EXPORT_SYMBOL_GPL(guestmem_grab_folio); > + > +int guestmem_punch_hole(struct address_space *mapping, loff_t offset, > + loff_t len) > +{ > + pgoff_t start = offset >> PAGE_SHIFT; > + pgoff_t end = (offset + len) >> PAGE_SHIFT; > + int ret; > + > + filemap_invalidate_lock(mapping); > + ret = __guestmem_invalidate_begin(mapping, start, end); > + if (ret) > + goto out; > + > + truncate_inode_pages_range(mapping, offset, offset + len - 1); > + > + __guestmem_invalidate_end(mapping, start, end); > + > +out: > + filemap_invalidate_unlock(mapping); > + return ret; > +} > +EXPORT_SYMBOL_GPL(guestmem_punch_hole); > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig > index fd6a3010afa833e077623065b80bdbb5b1012250..1339098795d2e859b2ee0ef419b29045aedc8487 100644 > --- a/virt/kvm/Kconfig > +++ b/virt/kvm/Kconfig > @@ -106,6 +106,7 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES > > config KVM_PRIVATE_MEM > select XARRAY_MULTI > + select GUESTMEM > bool > > config KVM_GENERIC_PRIVATE_MEM > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > index 24dcbad0cb76e353509cf4718837a1999f093414..edf57d5662cb8634bbd9ca3118b293c4f7ca229a 100644 > --- a/virt/kvm/guest_memfd.c > +++ b/virt/kvm/guest_memfd.c > @@ -1,6 +1,7 @@ > // SPDX-License-Identifier: GPL-2.0 > #include <linux/backing-dev.h> > #include <linux/falloc.h> > +#include <linux/guestmem.h> > #include <linux/kvm_host.h> > #include <linux/pagemap.h> > #include <linux/anon_inodes.h> > @@ -98,8 +99,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, > */ > static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) > { > - /* TODO: Support huge pages. */ > - return filemap_grab_folio(inode->i_mapping, index); > + return guestmem_grab_folio(inode->i_mapping, index); > } > > static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, > @@ -151,28 +151,7 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, > > static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) > { > - struct list_head *gmem_list = &inode->i_mapping->i_private_list; > - pgoff_t start = offset >> PAGE_SHIFT; > - pgoff_t end = (offset + len) >> PAGE_SHIFT; > - struct kvm_gmem *gmem; > - > - /* > - * Bindings must be stable across invalidation to ensure the start+end > - * are balanced. > - */ > - filemap_invalidate_lock(inode->i_mapping); > - > - list_for_each_entry(gmem, gmem_list, entry) > - kvm_gmem_invalidate_begin(gmem, start, end); > - > - truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); > - > - list_for_each_entry(gmem, gmem_list, entry) > - kvm_gmem_invalidate_end(gmem, start, end); > - > - filemap_invalidate_unlock(inode->i_mapping); > - > - return 0; > + return guestmem_punch_hole(inode->i_mapping, offset, len); > } > > static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) > @@ -277,7 +256,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) > kvm_gmem_invalidate_begin(gmem, 0, -1ul); > kvm_gmem_invalidate_end(gmem, 0, -1ul); > > - list_del(&gmem->entry); > + guestmem_detach_mapping(inode->i_mapping, &gmem->entry); > > filemap_invalidate_unlock(inode->i_mapping); > > @@ -318,63 +297,42 @@ void kvm_gmem_init(struct module *module) > kvm_gmem_fops.owner = module; > } > > -static int kvm_gmem_migrate_folio(struct address_space *mapping, > - struct folio *dst, struct folio *src, > - enum migrate_mode mode) > +static int kvm_guestmem_invalidate_begin(struct list_head *entry, pgoff_t start, > + pgoff_t end) > { > - WARN_ON_ONCE(1); > - return -EINVAL; > -} > + struct kvm_gmem *gmem = container_of(entry, struct kvm_gmem, entry); > > -static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio) > -{ > - struct list_head *gmem_list = &mapping->i_private_list; > - struct kvm_gmem *gmem; > - pgoff_t start, end; > - > - filemap_invalidate_lock_shared(mapping); > - > - start = folio->index; > - end = start + folio_nr_pages(folio); > - > - list_for_each_entry(gmem, gmem_list, entry) > - kvm_gmem_invalidate_begin(gmem, start, end); > - > - /* > - * Do not truncate the range, what action is taken in response to the > - * error is userspace's decision (assuming the architecture supports > - * gracefully handling memory errors). If/when the guest attempts to > - * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON, > - * at which point KVM can either terminate the VM or propagate the > - * error to userspace. > - */ > + kvm_gmem_invalidate_begin(gmem, start, end); > > - list_for_each_entry(gmem, gmem_list, entry) > - kvm_gmem_invalidate_end(gmem, start, end); > + return 0; > +} > > - filemap_invalidate_unlock_shared(mapping); > +static void kvm_guestmem_invalidate_end(struct list_head *entry, pgoff_t start, > + pgoff_t end) > +{ > + struct kvm_gmem *gmem = container_of(entry, struct kvm_gmem, entry); > > - return MF_DELAYED; > + kvm_gmem_invalidate_end(gmem, start, end); > } > > #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE > -static void kvm_gmem_free_folio(struct address_space *mapping, > - struct folio *folio) > +static bool kvm_gmem_release_folio(struct list_head *entry, struct folio *folio) > { > struct page *page = folio_page(folio, 0); > kvm_pfn_t pfn = page_to_pfn(page); > int order = folio_order(folio); > > kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); > + > + return true; > } > #endif > > -static const struct address_space_operations kvm_gmem_aops = { > - .dirty_folio = noop_dirty_folio, > - .migrate_folio = kvm_gmem_migrate_folio, > - .error_remove_folio = kvm_gmem_error_folio, > +static const struct guestmem_ops kvm_guestmem_ops = { > + .invalidate_begin = kvm_guestmem_invalidate_begin, > + .invalidate_end = kvm_guestmem_invalidate_end, > #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE > - .free_folio = kvm_gmem_free_folio, > + .release_folio = kvm_gmem_release_folio, > #endif > }; > > @@ -430,22 +388,22 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) > > inode->i_private = (void *)(unsigned long)flags; > inode->i_op = &kvm_gmem_iops; > - inode->i_mapping->a_ops = &kvm_gmem_aops; > inode->i_mode |= S_IFREG; > inode->i_size = size; > - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); > - mapping_set_inaccessible(inode->i_mapping); > - /* Unmovable mappings are supposed to be marked unevictable as well. */ > - WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); > + err = guestmem_attach_mapping(inode->i_mapping, &kvm_guestmem_ops, > + &gmem->entry); > + if (err) > + goto err_putfile; > > kvm_get_kvm(kvm); > gmem->kvm = kvm; > xa_init(&gmem->bindings); > - list_add(&gmem->entry, &inode->i_mapping->i_private_list); > > fd_install(fd, file); > return fd; > > +err_putfile: > + fput(file); > err_gmem: > kfree(gmem); > err_fd: > > -- > 2.34.1 >