On Fri, Apr 26, 2024, Paolo Bonzini wrote:
> On Thu, Apr 25, 2024 at 6:00 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
> >
> > On Thu, Apr 25, 2024, Paolo Bonzini wrote:
> > > On Thu, Apr 25, 2024 at 3:12 AM Isaku Yamahata <isaku.yamahata@xxxxxxxxx> wrote:
> > > > > > get_user_pages_fast(source addr)
> > > > > > read_lock(mmu_lock)
> > > > > > kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> > > > > > if the page table doesn't map gpa, error.
> > > > > > TDH.MEM.PAGE.ADD()
> > > > > > TDH.MR.EXTEND()
> > > > > > read_unlock(mmu_lock)
> > > > > > put_page()
> > > > >
> > > > > Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> > > > > invalidation, but I also don't see why it would cause problems.
> > > The invalidate_lock is only needed to operate on the guest_memfd, but
> > > it's a rwsem so there are no risks of lock inversion.
> > >
> > > > > I.e. why not
> > > > > take mmu_lock() in TDX's post_populate() implementation?
> > > > We can take the lock.  Because we have already populated the GFN of guest_memfd,
> > > > we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY.  Otherwise we'll
> > > > get -EEXIST.
> > > I don't understand why TDH.MEM.PAGE.ADD() cannot be called from the
> > > post-populate hook.  Can the code for TDH.MEM.PAGE.ADD be shared
> > > between the memory initialization ioctl and the page fault hook in
> > > kvm_x86_ops?
> > Ah, because TDX is required to pre-fault the memory to establish the S-EPT walk,
> > and pre-faulting means guest_memfd()
> >
> > Requiring that guest_memfd not have a page when initializing the guest image
> > seems wrong, i.e. I don't think we want FGP_CREAT_ONLY.  And not just because I
> > am a fan of pre-faulting, I think the semantics are bad.
> Ok, fair enough.  I wanted to do the once-only test in common code but since
> SEV code checks for the RMP I can remove that.  One less headache.

I definitely don't object to having a check in common code, and I'd be in favor
of removing the RMP checks if possible, but tracking needs to be something more
explicit in guest_memfd.

*sigh*

I even left behind a TODO for this exact thing, and y'all didn't even wave at it
as you flew by :-)

        /*
         * Use the up-to-date flag to track whether or not the memory has been
         * zeroed before being handed off to the guest.  There is no backing
         * storage for the memory, so the folio will remain up-to-date until
         * it's removed.
         *
         * TODO: Skip clearing pages when trusted firmware will do it when  <==========================
         * assigning memory to the guest.
         */
        if (!folio_test_uptodate(folio)) {
                unsigned long nr_pages = folio_nr_pages(folio);
                unsigned long i;

                for (i = 0; i < nr_pages; i++)
                        clear_highpage(folio_page(folio, i));

                folio_mark_uptodate(folio);
        }

        if (prepare) {
                int r = kvm_gmem_prepare_folio(inode, index, folio);
                if (r < 0) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ERR_PTR(r);
                }
        }

Compile tested only (and not even fully as I didn't bother defining
CONFIG_HAVE_KVM_GMEM_INITIALIZE), but I think this is the basic gist.
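For illustration, a TDX implementation of the initialize hook under this scheme
could look roughly like the sketch below, i.e. the flow quoted at the top of the
thread minus the ioctl plumbing.  This is only a sketch: tdx_gmem_initialize(),
tdx_mem_page_add(), tdx_mr_extend() and kvm_tdp_mmu_get_walk_private_pfn() are
placeholder names for whatever the TDX series actually provides, the pgoff==gfn
simplification is for brevity, and all TDX error handling (BUSY retries, etc.)
is omitted.

        static int tdx_gmem_initialize(struct kvm *kvm, struct folio *folio,
                                       pgoff_t index, void __user *src, void *opaque)
        {
                struct kvm_vcpu *vcpu = opaque;                 /* cookie from kvm_gmem_populate() */
                gpa_t gpa = (gpa_t)index << PAGE_SHIFT;         /* assumes pgoff == gfn, for brevity */
                struct page *src_page;
                kvm_pfn_t pfn;
                int r;

                /* Pin the source page so TDH.MEM.PAGE.ADD can copy from it. */
                r = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
                if (r != 1)
                        return r < 0 ? r : -ENOMEM;

                read_lock(&kvm->mmu_lock);

                /* The S-EPT mapping must already exist, i.e. the gfn is pre-faulted. */
                r = kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
                if (r)
                        goto out;

                /* The walk should find the same folio guest_memfd just handed us. */
                if (WARN_ON_ONCE(pfn != folio_pfn(folio))) {
                        r = -EIO;
                        goto out;
                }

                r = tdx_mem_page_add(kvm, gpa, pfn, page_to_pfn(src_page));    /* TDH.MEM.PAGE.ADD */
                if (!r)
                        r = tdx_mr_extend(kvm, gpa);                            /* TDH.MR.EXTEND */
        out:
                read_unlock(&kvm->mmu_lock);
                put_page(src_page);
                return r;
        }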
8< --------------------------------
// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
        struct kvm *kvm;
        struct xarray bindings;
        struct list_head entry;
};

static int kvm_gmem_initialize_folio(struct kvm *kvm, struct folio *folio,
                                     pgoff_t index, void __user *src,
                                     void *opaque)
{
#ifdef CONFIG_HAVE_KVM_GMEM_INITIALIZE
        return kvm_arch_gmem_initialize(kvm, folio, index, src, opaque);
#else
        unsigned long nr_pages = folio_nr_pages(folio);
        unsigned long i;

        if (WARN_ON_ONCE(src))
                return -EIO;

        for (i = 0; i < nr_pages; i++)
                clear_highpage(folio_file_page(folio, index + i));
#endif
        return 0;
}

static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
        return gfn - slot->base_gfn + slot->gmem.pgoff;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
        /*
         * Do not return slot->gmem.file if it has already been closed;
         * there might be some time between the last fput() and when
         * kvm_gmem_release() clears slot->gmem.file, and you do not
         * want to spin in the meanwhile.
         */
        return get_file_active(&slot->gmem.file);
}

static struct folio *__kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
        fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
        struct folio *folio;

        /*
         * The caller is responsible for managing the up-to-date flag (or not),
         * as the memory doesn't need to be initialized until it's actually
         * mapped into the guest.  Waiting to initialize memory is necessary
         * for VM types where the memory can be initialized exactly once.
         *
         * Ignore accessed, referenced, and dirty flags.  The memory is
         * unevictable and there is no storage to write back to.
         *
         * TODO: Support huge pages.
         */
        folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags,
                                    mapping_gfp_mask(inode->i_mapping));
        if (IS_ERR(folio))
                return folio;

        if (folio_test_hwpoison(folio)) {
                folio_unlock(folio);
                return ERR_PTR(-EHWPOISON);
        }
        return folio;
}

static struct folio *kvm_gmem_get_folio(struct file *file,
                                        struct kvm_memory_slot *slot,
                                        gfn_t gfn)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct kvm_gmem *gmem = file->private_data;
        struct inode *inode;

        if (file != slot->gmem.file) {
                WARN_ON_ONCE(slot->gmem.file);
                return ERR_PTR(-EFAULT);
        }

        gmem = file->private_data;
        if (xa_load(&gmem->bindings, index) != slot) {
                WARN_ON_ONCE(xa_load(&gmem->bindings, index));
                return ERR_PTR(-EIO);
        }

        inode = file_inode(file);

        /*
         * The caller is responsible for managing the up-to-date flag (or not),
         * as the memory doesn't need to be initialized until it's actually
         * mapped into the guest.  Waiting to initialize memory is necessary
         * for VM types where the memory can be initialized exactly once.
         *
         * Ignore accessed, referenced, and dirty flags.  The memory is
         * unevictable and there is no storage to write back to.
         *
         * TODO: Support huge pages.
         */
        return __kvm_gmem_get_folio(inode, index);
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct file *file = kvm_gmem_get_file(slot);
        struct folio *folio;
        struct page *page;

        if (!file)
                return -EFAULT;

        folio = kvm_gmem_get_folio(file, slot, gfn);
        if (IS_ERR(folio))
                goto out;

        if (!folio_test_uptodate(folio)) {
                kvm_gmem_initialize_folio(kvm, folio, index, NULL, NULL);
                folio_mark_uptodate(folio);
        }

        page = folio_file_page(folio, index);

        *pfn = page_to_pfn(page);
        if (max_order)
                *max_order = 0;

out:
        fput(file);
        return IS_ERR(folio) ? PTR_ERR(folio) : 0;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);

long kvm_gmem_populate(struct kvm *kvm, gfn_t base_gfn, void __user *base_src,
                       long npages, void *opaque)
{
        struct kvm_memory_slot *slot;
        struct file *file;
        int ret = 0, max_order = 0;
        long i;

        lockdep_assert_held(&kvm->slots_lock);
        if (npages < 0)
                return -EINVAL;

        slot = gfn_to_memslot(kvm, base_gfn);
        if (!kvm_slot_can_be_private(slot))
                return -EINVAL;

        file = kvm_gmem_get_file(slot);
        if (!file)
                return -EFAULT;

        filemap_invalidate_lock(file->f_mapping);

        npages = min_t(ulong, slot->npages - (base_gfn - slot->base_gfn), npages);
        for (i = 0; i < npages; i += (1 << max_order)) {
                void __user *src = base_src + i * PAGE_SIZE;
                gfn_t gfn = base_gfn + i;
                pgoff_t index = kvm_gmem_get_index(slot, gfn);
                struct folio *folio;

                folio = kvm_gmem_get_folio(file, slot, gfn);
                if (IS_ERR(folio)) {
                        ret = PTR_ERR(folio);
                        break;
                }

                if (folio_test_uptodate(folio)) {
                        folio_put(folio);
                        ret = -EEXIST;
                        break;
                }

                kvm_gmem_initialize_folio(kvm, folio, index, src, opaque);
                folio_unlock(folio);
                folio_put(folio);
        }

        filemap_invalidate_unlock(file->f_mapping);

        fput(file);
        return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);

static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
                                      pgoff_t end)
{
        bool flush = false, found_memslot = false;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = gmem->kvm;
        unsigned long index;

        xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
                pgoff_t pgoff = slot->gmem.pgoff;

                struct kvm_gfn_range gfn_range = {
                        .start = slot->base_gfn + max(pgoff, start) - pgoff,
                        .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
                        .slot = slot,
                        .may_block = true,
                };

                if (!found_memslot) {
                        found_memslot = true;

                        KVM_MMU_LOCK(kvm);
                        kvm_mmu_invalidate_begin(kvm);
                }

                flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
        }

        if (flush)
                kvm_flush_remote_tlbs(kvm);

        if (found_memslot)
                KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
                                    pgoff_t end)
{
        struct kvm *kvm = gmem->kvm;

        if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
                KVM_MMU_LOCK(kvm);
                kvm_mmu_invalidate_end(kvm);
                KVM_MMU_UNLOCK(kvm);
        }
}

static long __kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        struct list_head *gmem_list = &inode->i_mapping->i_private_list;
        pgoff_t start = offset >> PAGE_SHIFT;
        pgoff_t end = (offset + len) >> PAGE_SHIFT;
        struct kvm_gmem *gmem;

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_begin(gmem, start, end);

        truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_end(gmem, start, end);

        return 0;
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        int r;

        /*
         * Bindings must be stable across invalidation to ensure the start+end
         * are balanced.
         */
        filemap_invalidate_lock(inode->i_mapping);

        r = __kvm_gmem_punch_hole(inode, offset, len);

        filemap_invalidate_unlock(inode->i_mapping);

        return r;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t start, index, end;
        int r;

        /* Dedicated guest is immutable by default. */
        if (offset + len > i_size_read(inode))
                return -EINVAL;

        filemap_invalidate_lock_shared(mapping);

        start = offset >> PAGE_SHIFT;
        end = (offset + len) >> PAGE_SHIFT;

        r = 0;
        for (index = start; index < end; ) {
                struct folio *folio;

                if (signal_pending(current)) {
                        r = -EINTR;
                        break;
                }

                folio = __kvm_gmem_get_folio(inode, index);
                if (IS_ERR(folio)) {
                        r = PTR_ERR(folio);
                        break;
                }

                index = folio_next_index(folio);

                folio_unlock(folio);
                folio_put(folio);

                /* 64-bit only, wrapping the index should be impossible. */
                if (WARN_ON_ONCE(!index))
                        break;

                cond_resched();
        }

        filemap_invalidate_unlock_shared(mapping);

        return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
                               loff_t len)
{
        int ret;

        if (!(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
                return -EINVAL;

        if (mode & FALLOC_FL_PUNCH_HOLE)
                ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
        else
                ret = kvm_gmem_allocate(file_inode(file), offset, len);

        if (!ret)
                file_modified(file);
        return ret;
}

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
        struct kvm_gmem *gmem = file->private_data;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = gmem->kvm;
        unsigned long index;

        /*
         * Prevent concurrent attempts to *unbind* a memslot.  This is the last
         * reference to the file and thus no new bindings can be created, but
         * dereferencing the slot for existing bindings needs to be protected
         * against memslot updates, specifically so that unbind doesn't race
         * and free the memslot (kvm_gmem_get_file() will return NULL).
         */
        mutex_lock(&kvm->slots_lock);

        filemap_invalidate_lock(inode->i_mapping);

        xa_for_each(&gmem->bindings, index, slot)
                rcu_assign_pointer(slot->gmem.file, NULL);

        synchronize_rcu();

        /*
         * All in-flight operations are gone and new bindings can be created.
         * Zap all SPTEs pointed at by this file.  Do not free the backing
         * memory, as its lifetime is associated with the inode, not the file.
         */
        kvm_gmem_invalidate_begin(gmem, 0, -1ul);
        kvm_gmem_invalidate_end(gmem, 0, -1ul);

        list_del(&gmem->entry);

        filemap_invalidate_unlock(inode->i_mapping);

        mutex_unlock(&kvm->slots_lock);

        xa_destroy(&gmem->bindings);
        kfree(gmem);

        kvm_put_kvm(kvm);

        return 0;
}

static struct file_operations kvm_gmem_fops = {
        .open           = generic_file_open,
        .release        = kvm_gmem_release,
        .fallocate      = kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
        kvm_gmem_fops.owner = module;
}

static int kvm_gmem_migrate_folio(struct address_space *mapping,
                                  struct folio *dst, struct folio *src,
                                  enum migrate_mode mode)
{
        WARN_ON_ONCE(1);
        return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
        struct list_head *gmem_list = &mapping->i_private_list;
        struct kvm_gmem *gmem;
        pgoff_t start, end;

        filemap_invalidate_lock_shared(mapping);

        start = folio->index;
        end = start + folio_nr_pages(folio);

        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_begin(gmem, start, end);

        /*
         * Do not truncate the range, what action is taken in response to the
         * error is userspace's decision (assuming the architecture supports
         * gracefully handling memory errors).  If/when the guest attempts to
         * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
         * at which point KVM can either terminate the VM or propagate the
         * error to userspace.
         */
        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_end(gmem, start, end);

        filemap_invalidate_unlock_shared(mapping);

        return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
        struct page *page = folio_page(folio, 0);
        kvm_pfn_t pfn = page_to_pfn(page);
        int order = folio_order(folio);

        kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
        .dirty_folio = noop_dirty_folio,
        .migrate_folio  = kvm_gmem_migrate_folio,
        .error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
        .free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
                            struct kstat *stat, u32 request_mask,
                            unsigned int query_flags)
{
        struct inode *inode = path->dentry->d_inode;

        generic_fillattr(idmap, request_mask, inode, stat);
        return 0;
}

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                            struct iattr *attr)
{
        return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
        .getattr        = kvm_gmem_getattr,
        .setattr        = kvm_gmem_setattr,
};

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
        const char *anon_name = "[kvm-gmem]";
        struct kvm_gmem *gmem;
        struct inode *inode;
        struct file *file;
        int fd, err;

        fd = get_unused_fd_flags(0);
        if (fd < 0)
                return fd;

        gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
        if (!gmem) {
                err = -ENOMEM;
                goto err_fd;
        }

        file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
                                         O_RDWR, NULL);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                goto err_gmem;
        }

        file->f_flags |= O_LARGEFILE;

        inode = file->f_inode;
        WARN_ON(file->f_mapping != inode->i_mapping);

        inode->i_private = (void *)(unsigned long)flags;
        inode->i_op = &kvm_gmem_iops;
        inode->i_mapping->a_ops = &kvm_gmem_aops;
        inode->i_mapping->flags |= AS_INACCESSIBLE;
        inode->i_mode |= S_IFREG;
        inode->i_size = size;
        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_unmovable(inode->i_mapping);
        /* Unmovable mappings are supposed to be marked unevictable as well. */
        WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

        kvm_get_kvm(kvm);
        gmem->kvm = kvm;
        xa_init(&gmem->bindings);
        list_add(&gmem->entry, &inode->i_mapping->i_private_list);

        fd_install(fd, file);
        return fd;

err_gmem:
        kfree(gmem);
err_fd:
        put_unused_fd(fd);
        return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
        loff_t size = args->size;
        u64 flags = args->flags;
        u64 valid_flags = 0;

        if (flags & ~valid_flags)
                return -EINVAL;

        if (size <= 0 || !PAGE_ALIGNED(size))
                return -EINVAL;

        return __kvm_gmem_create(kvm, size, flags);
}

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
                  unsigned int fd, loff_t offset)
{
        loff_t size = slot->npages << PAGE_SHIFT;
        unsigned long start, end;
        struct kvm_gmem *gmem;
        struct inode *inode;
        struct file *file;
        int r = -EINVAL;

        BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

        file = fget(fd);
        if (!file)
                return -EBADF;

        if (file->f_op != &kvm_gmem_fops)
                goto err;

        gmem = file->private_data;
        if (gmem->kvm != kvm)
                goto err;

        inode = file_inode(file);

        if (offset < 0 || !PAGE_ALIGNED(offset) ||
            offset + size > i_size_read(inode))
                goto err;

        filemap_invalidate_lock(inode->i_mapping);

        start = offset >> PAGE_SHIFT;
        end = start + slot->npages;

        if (!xa_empty(&gmem->bindings) &&
            xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
                filemap_invalidate_unlock(inode->i_mapping);
                goto err;
        }

        /*
         * No synchronize_rcu() needed, any in-flight readers are guaranteed to
         * see either a NULL file or this new file, no need for them to go
         * away.
         */
        rcu_assign_pointer(slot->gmem.file, file);
        slot->gmem.pgoff = start;

        xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
        filemap_invalidate_unlock(inode->i_mapping);

        /*
         * Drop the reference to the file, even on success.  The file pins KVM,
         * not the other way 'round.  Active bindings are invalidated if the
         * file is closed before memslots are destroyed.
         */
        r = 0;
err:
        fput(file);
        return r;
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
        unsigned long start = slot->gmem.pgoff;
        unsigned long end = start + slot->npages;
        struct kvm_gmem *gmem;
        struct file *file;

        /*
         * Nothing to do if the underlying file was already closed (or is being
         * closed right now), kvm_gmem_release() invalidates all bindings.
         */
        file = kvm_gmem_get_file(slot);
        if (!file)
                return;

        gmem = file->private_data;

        filemap_invalidate_lock(file->f_mapping);
        xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
        rcu_assign_pointer(slot->gmem.file, NULL);
        synchronize_rcu();
        filemap_invalidate_unlock(file->f_mapping);

        fput(file);
}
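For completeness, the expected call pattern from a vendor ioctl is roughly the
sketch below; it only illustrates how kvm_gmem_populate() above is meant to be
driven.  kvm_tdx_init_mem_region() and its arguments are placeholders for
however TDX/SNP end up wiring this into their "initialize guest image" ioctls.

        static int kvm_tdx_init_mem_region(struct kvm *kvm, struct kvm_vcpu *vcpu,
                                           gpa_t gpa, void __user *src, long npages)
        {
                long ret;

                /* kvm_gmem_populate() asserts that slots_lock is held. */
                mutex_lock(&kvm->slots_lock);

                /*
                 * Resolves the memslot, takes the guest_memfd invalidate_lock,
                 * and invokes the initialize hook once per folio that isn't
                 * already up-to-date, passing 'vcpu' through as the opaque
                 * cookie.
                 */
                ret = kvm_gmem_populate(kvm, gpa_to_gfn(gpa), src, npages, vcpu);

                mutex_unlock(&kvm->slots_lock);

                if (ret < 0)
                        return ret;

                /*
                 * kvm_gmem_populate() returns the number of pages populated; a
                 * short count means a later folio failed, e.g. -EEXIST if it
                 * was already initialized (uptodate).
                 */
                return ret == npages ? 0 : -EIO;
        }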