SNP folks and/or Paolo, what's the plan for this? I don't see how what's sitting
in kvm/next can possibly be correct without conditioning population on the folio
being !uptodate.

On Fri, Apr 26, 2024, Sean Christopherson wrote:
> On Fri, Apr 26, 2024, Paolo Bonzini wrote:
> > On Thu, Apr 25, 2024 at 6:00 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
> > >
> > > On Thu, Apr 25, 2024, Paolo Bonzini wrote:
> > > > On Thu, Apr 25, 2024 at 3:12 AM Isaku Yamahata <isaku.yamahata@xxxxxxxxx> wrote:
> > > > > > >   get_user_pages_fast(source addr)
> > > > > > >   read_lock(mmu_lock)
> > > > > > >   kvm_tdp_mmu_get_walk_private_pfn(vcpu, gpa, &pfn);
> > > > > > >   if the page table doesn't map gpa, error.
> > > > > > >   TDH.MEM.PAGE.ADD()
> > > > > > >   TDH.MR.EXTEND()
> > > > > > >   read_unlock(mmu_lock)
> > > > > > >   put_page()
> > > > > >
> > > > > > Hmm, KVM doesn't _need_ to use invalidate_lock to protect against guest_memfd
> > > > > > invalidation, but I also don't see why it would cause problems.
> > > >
> > > > The invalidate_lock is only needed to operate on the guest_memfd, but
> > > > it's a rwsem so there are no risks of lock inversion.
> > > >
> > > > > > I.e. why not
> > > > > > take mmu_lock() in TDX's post_populate() implementation?
> > > > >
> > > > > We can take the lock. Because we have already populated the GFN of guest_memfd,
> > > > > we need to make kvm_gmem_populate() not pass FGP_CREAT_ONLY. Otherwise we'll
> > > > > get -EEXIST.
> > > >
> > > > I don't understand why TDH.MEM.PAGE.ADD() cannot be called from the
> > > > post-populate hook. Can the code for TDH.MEM.PAGE.ADD be shared
> > > > between the memory initialization ioctl and the page fault hook in
> > > > kvm_x86_ops?
> > >
> > > Ah, because TDX is required to pre-fault the memory to establish the S-EPT walk,
> > > and pre-faulting means guest_memfd()
> > >
> > > Requiring that guest_memfd not have a page when initializing the guest image
> > > seems wrong, i.e. I don't think we want FGP_CREAT_ONLY. And not just because I
> > > am a fan of pre-faulting, I think the semantics are bad.
> >
> > Ok, fair enough. I wanted to do the once-only test in common code but since
> > SEV code checks for the RMP I can remove that. One less headache.
>
> I definitely don't object to having a check in common code, and I'd be in favor
> of removing the RMP checks if possible, but tracking needs to be something more
> explicit in guest_memfd.
>
> *sigh*
>
> I even left behind a TODO for this exact thing, and y'all didn't even wave at it
> as you flew by :-)
>
>         /*
>          * Use the up-to-date flag to track whether or not the memory has been
>          * zeroed before being handed off to the guest.  There is no backing
>          * storage for the memory, so the folio will remain up-to-date until
>          * it's removed.
>          *
>          * TODO: Skip clearing pages when trusted firmware will do it when  <==========================
>          * assigning memory to the guest.
>          */
>         if (!folio_test_uptodate(folio)) {
>                 unsigned long nr_pages = folio_nr_pages(folio);
>                 unsigned long i;
>
>                 for (i = 0; i < nr_pages; i++)
>                         clear_highpage(folio_page(folio, i));
>
>                 folio_mark_uptodate(folio);
>         }
>
>         if (prepare) {
>                 int r = kvm_gmem_prepare_folio(inode, index, folio);
>                 if (r < 0) {
>                         folio_unlock(folio);
>                         folio_put(folio);
>                         return ERR_PTR(r);
>                 }
>         }
>
> Compile tested only (and not even fully as I didn't bother defining
> CONFIG_HAVE_KVM_GMEM_INITIALIZE), but I think this is the basic gist.
>
> 8< --------------------------------
>
> // SPDX-License-Identifier: GPL-2.0
> #include <linux/backing-dev.h>
> #include <linux/falloc.h>
> #include <linux/kvm_host.h>
> #include <linux/pagemap.h>
> #include <linux/anon_inodes.h>
>
> #include "kvm_mm.h"
>
> struct kvm_gmem {
>         struct kvm *kvm;
>         struct xarray bindings;
>         struct list_head entry;
> };
>
> static int kvm_gmem_initialize_folio(struct kvm *kvm, struct folio *folio,
>                                      pgoff_t index, void __user *src,
>                                      void *opaque)
> {
> #ifdef CONFIG_HAVE_KVM_GMEM_INITIALIZE
>         return kvm_arch_gmem_initialize(kvm, folio, index, src, opaque);
> #else
>         unsigned long nr_pages = folio_nr_pages(folio);
>         unsigned long i;
>
>         if (WARN_ON_ONCE(src))
>                 return -EIO;
>
>         for (i = 0; i < nr_pages; i++)
>                 clear_highpage(folio_file_page(folio, index + i));
> #endif
>         return 0;
> }
>
>
> static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
> {
>         return gfn - slot->base_gfn + slot->gmem.pgoff;
> }
>
>
> static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
> {
>         /*
>          * Do not return slot->gmem.file if it has already been closed;
>          * there might be some time between the last fput() and when
>          * kvm_gmem_release() clears slot->gmem.file, and you do not
>          * want to spin in the meanwhile.
>          */
>         return get_file_active(&slot->gmem.file);
> }
>
> static struct folio *__kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> {
>         fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
>         struct folio *folio;
>
>         /*
>          * The caller is responsible for managing the up-to-date flag (or not),
>          * as the memory doesn't need to be initialized until it's actually
>          * mapped into the guest.  Waiting to initialize memory is necessary
>          * for VM types where the memory can be initialized exactly once.
>          *
>          * Ignore accessed, referenced, and dirty flags.  The memory is
>          * unevictable and there is no storage to write back to.
>          *
>          * TODO: Support huge pages.
>          */
>         folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags,
>                                     mapping_gfp_mask(inode->i_mapping));
>
>         if (folio_test_hwpoison(folio)) {
>                 folio_unlock(folio);
>                 return ERR_PTR(-EHWPOISON);
>         }
>         return folio;
> }
>
> static struct folio *kvm_gmem_get_folio(struct file *file,
>                                         struct kvm_memory_slot *slot,
>                                         gfn_t gfn)
> {
>         pgoff_t index = kvm_gmem_get_index(slot, gfn);
>         struct kvm_gmem *gmem = file->private_data;
>         struct inode *inode;
>
>         if (file != slot->gmem.file) {
>                 WARN_ON_ONCE(slot->gmem.file);
>                 return ERR_PTR(-EFAULT);
>         }
>
>         gmem = file->private_data;
>         if (xa_load(&gmem->bindings, index) != slot) {
>                 WARN_ON_ONCE(xa_load(&gmem->bindings, index));
>                 return ERR_PTR(-EIO);
>         }
>
>         inode = file_inode(file);
>
>         /*
>          * The caller is responsible for managing the up-to-date flag (or not),
>          * as the memory doesn't need to be initialized until it's actually
>          * mapped into the guest.  Waiting to initialize memory is necessary
>          * for VM types where the memory can be initialized exactly once.
>          *
>          * Ignore accessed, referenced, and dirty flags.  The memory is
>          * unevictable and there is no storage to write back to.
>          *
>          * TODO: Support huge pages.
>          */
>         return __kvm_gmem_get_folio(inode, index);
> }
>
> int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                      gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
> {
>         pgoff_t index = kvm_gmem_get_index(slot, gfn);
>         struct file *file = kvm_gmem_get_file(slot);
>         struct folio *folio;
>         struct page *page;
>
>         if (!file)
>                 return -EFAULT;
>
>         folio = kvm_gmem_get_folio(file, slot, gfn);
>         if (IS_ERR(folio))
>                 goto out;
>
>         if (!folio_test_uptodate(folio)) {
>                 kvm_gmem_initialize_folio(kvm, folio, index, NULL, NULL);
>                 folio_mark_uptodate(folio);
>         }
>
>         page = folio_file_page(folio, index);
>
>         *pfn = page_to_pfn(page);
>         if (max_order)
>                 *max_order = 0;
>
> out:
>         fput(file);
>         return IS_ERR(folio) ? PTR_ERR(folio) : 0;
> }
> EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
>
> long kvm_gmem_populate(struct kvm *kvm, gfn_t base_gfn, void __user *base_src,
>                        long npages, void *opaque)
> {
>         struct kvm_memory_slot *slot;
>         struct file *file;
>         int ret = 0, max_order;
>         long i;
>
>         lockdep_assert_held(&kvm->slots_lock);
>         if (npages < 0)
>                 return -EINVAL;
>
>         slot = gfn_to_memslot(kvm, base_gfn);
>         if (!kvm_slot_can_be_private(slot))
>                 return -EINVAL;
>
>         file = kvm_gmem_get_file(slot);
>         if (!file)
>                 return -EFAULT;
>
>         filemap_invalidate_lock(file->f_mapping);
>
>         npages = min_t(ulong, slot->npages - (base_gfn - slot->base_gfn), npages);
>         for (i = 0; i < npages; i += (1 << max_order)) {
>                 void __user *src = base_src + i * PAGE_SIZE;
>                 gfn_t gfn = base_gfn + i;
>                 pgoff_t index = kvm_gmem_get_index(slot, gfn);
>                 struct folio *folio;
>
>                 folio = kvm_gmem_get_folio(file, slot, gfn);
>                 if (IS_ERR(folio)) {
>                         ret = PTR_ERR(folio);
>                         break;
>                 }
>
>                 if (folio_test_uptodate(folio)) {
>                         folio_put(folio);
>                         ret = -EEXIST;
>                         break;
>                 }
>
>                 kvm_gmem_initialize_folio(kvm, folio, index, src, opaque);
>                 folio_unlock(folio);
>                 folio_put(folio);
>         }
>
>         filemap_invalidate_unlock(file->f_mapping);
>
>         fput(file);
>         return ret && !i ? ret : i;
> }
> EXPORT_SYMBOL_GPL(kvm_gmem_populate);
>
> static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
>                                       pgoff_t end)
> {
>         bool flush = false, found_memslot = false;
>         struct kvm_memory_slot *slot;
>         struct kvm *kvm = gmem->kvm;
>         unsigned long index;
>
>         xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
>                 pgoff_t pgoff = slot->gmem.pgoff;
>
>                 struct kvm_gfn_range gfn_range = {
>                         .start = slot->base_gfn + max(pgoff, start) - pgoff,
>                         .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
>                         .slot = slot,
>                         .may_block = true,
>                 };
>
>                 if (!found_memslot) {
>                         found_memslot = true;
>
>                         KVM_MMU_LOCK(kvm);
>                         kvm_mmu_invalidate_begin(kvm);
>                 }
>
>                 flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
>         }
>
>         if (flush)
>                 kvm_flush_remote_tlbs(kvm);
>
>         if (found_memslot)
>                 KVM_MMU_UNLOCK(kvm);
> }
>
> static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
>                                     pgoff_t end)
> {
>         struct kvm *kvm = gmem->kvm;
>
>         if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
>                 KVM_MMU_LOCK(kvm);
>                 kvm_mmu_invalidate_end(kvm);
>                 KVM_MMU_UNLOCK(kvm);
>         }
> }
>
> static long __kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> {
>         struct list_head *gmem_list = &inode->i_mapping->i_private_list;
>         pgoff_t start = offset >> PAGE_SHIFT;
>         pgoff_t end = (offset + len) >> PAGE_SHIFT;
>         struct kvm_gmem *gmem;
>
>         list_for_each_entry(gmem, gmem_list, entry)
>                 kvm_gmem_invalidate_begin(gmem, start, end);
>
>         truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
>
>         list_for_each_entry(gmem, gmem_list, entry)
>                 kvm_gmem_invalidate_end(gmem, start, end);
>
>         return 0;
> }
>
> static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> {
>         int r;
>
>         /*
>          * Bindings must be stable across invalidation to ensure the start+end
>          * are balanced.
>          */
>         filemap_invalidate_lock(inode->i_mapping);
>         r = __kvm_gmem_punch_hole(inode, offset, len);
>         filemap_invalidate_unlock(inode->i_mapping);
>         return r;
> }
>
> static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> {
>         struct address_space *mapping = inode->i_mapping;
>         pgoff_t start, index, end;
>         int r;
>
>         /* Dedicated guest is immutable by default. */
>         if (offset + len > i_size_read(inode))
>                 return -EINVAL;
>
>         filemap_invalidate_lock_shared(mapping);
>
>         start = offset >> PAGE_SHIFT;
>         end = (offset + len) >> PAGE_SHIFT;
>
>         r = 0;
>         for (index = start; index < end; ) {
>                 struct folio *folio;
>
>                 if (signal_pending(current)) {
>                         r = -EINTR;
>                         break;
>                 }
>
>                 folio = __kvm_gmem_get_folio(inode, index);
>                 if (IS_ERR(folio)) {
>                         r = PTR_ERR(folio);
>                         break;
>                 }
>
>                 index = folio_next_index(folio);
>
>                 folio_unlock(folio);
>                 folio_put(folio);
>
>                 /* 64-bit only, wrapping the index should be impossible. */
>                 if (WARN_ON_ONCE(!index))
>                         break;
>
>                 cond_resched();
>         }
>
>         filemap_invalidate_unlock_shared(mapping);
>
>         return r;
> }
>
> static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>                                loff_t len)
> {
>         int ret;
>
>         if (!(mode & FALLOC_FL_KEEP_SIZE))
>                 return -EOPNOTSUPP;
>
>         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>                 return -EOPNOTSUPP;
>
>         if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
>                 return -EINVAL;
>
>         if (mode & FALLOC_FL_PUNCH_HOLE)
>                 ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
>         else
>                 ret = kvm_gmem_allocate(file_inode(file), offset, len);
>
>         if (!ret)
>                 file_modified(file);
>         return ret;
> }
>
> static int kvm_gmem_release(struct inode *inode, struct file *file)
> {
>         struct kvm_gmem *gmem = file->private_data;
>         struct kvm_memory_slot *slot;
>         struct kvm *kvm = gmem->kvm;
>         unsigned long index;
>
>         /*
>          * Prevent concurrent attempts to *unbind* a memslot.  This is the last
>          * reference to the file and thus no new bindings can be created, but
>          * dereferencing the slot for existing bindings needs to be protected
>          * against memslot updates, specifically so that unbind doesn't race
>          * and free the memslot (kvm_gmem_get_file() will return NULL).
>          */
>         mutex_lock(&kvm->slots_lock);
>
>         filemap_invalidate_lock(inode->i_mapping);
>
>         xa_for_each(&gmem->bindings, index, slot)
>                 rcu_assign_pointer(slot->gmem.file, NULL);
>
>         synchronize_rcu();
>
>         /*
>          * All in-flight operations are gone and new bindings can be created.
>          * Zap all SPTEs pointed at by this file.  Do not free the backing
>          * memory, as its lifetime is associated with the inode, not the file.
>          */
>         kvm_gmem_invalidate_begin(gmem, 0, -1ul);
>         kvm_gmem_invalidate_end(gmem, 0, -1ul);
>
>         list_del(&gmem->entry);
>
>         filemap_invalidate_unlock(inode->i_mapping);
>
>         mutex_unlock(&kvm->slots_lock);
>
>         xa_destroy(&gmem->bindings);
>         kfree(gmem);
>
>         kvm_put_kvm(kvm);
>
>         return 0;
> }
>
> static struct file_operations kvm_gmem_fops = {
>         .open = generic_file_open,
>         .release = kvm_gmem_release,
>         .fallocate = kvm_gmem_fallocate,
> };
>
> void kvm_gmem_init(struct module *module)
> {
>         kvm_gmem_fops.owner = module;
> }
>
> static int kvm_gmem_migrate_folio(struct address_space *mapping,
>                                   struct folio *dst, struct folio *src,
>                                   enum migrate_mode mode)
> {
>         WARN_ON_ONCE(1);
>         return -EINVAL;
> }
>
> static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
> {
>         struct list_head *gmem_list = &mapping->i_private_list;
>         struct kvm_gmem *gmem;
>         pgoff_t start, end;
>
>         filemap_invalidate_lock_shared(mapping);
>
>         start = folio->index;
>         end = start + folio_nr_pages(folio);
>
>         list_for_each_entry(gmem, gmem_list, entry)
>                 kvm_gmem_invalidate_begin(gmem, start, end);
>
>         /*
>          * Do not truncate the range, what action is taken in response to the
>          * error is userspace's decision (assuming the architecture supports
>          * gracefully handling memory errors).  If/when the guest attempts to
>          * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
>          * at which point KVM can either terminate the VM or propagate the
>          * error to userspace.
>          */
>
>         list_for_each_entry(gmem, gmem_list, entry)
>                 kvm_gmem_invalidate_end(gmem, start, end);
>
>         filemap_invalidate_unlock_shared(mapping);
>
>         return MF_DELAYED;
> }
>
> #ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
> static void kvm_gmem_free_folio(struct folio *folio)
> {
>         struct page *page = folio_page(folio, 0);
>         kvm_pfn_t pfn = page_to_pfn(page);
>         int order = folio_order(folio);
>
>         kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> }
> #endif
>
> static const struct address_space_operations kvm_gmem_aops = {
>         .dirty_folio = noop_dirty_folio,
>         .migrate_folio = kvm_gmem_migrate_folio,
>         .error_remove_folio = kvm_gmem_error_folio,
> #ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
>         .free_folio = kvm_gmem_free_folio,
> #endif
> };
>
> static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
>                             struct kstat *stat, u32 request_mask,
>                             unsigned int query_flags)
> {
>         struct inode *inode = path->dentry->d_inode;
>
>         generic_fillattr(idmap, request_mask, inode, stat);
>         return 0;
> }
>
> static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>                             struct iattr *attr)
> {
>         return -EINVAL;
> }
>
> static const struct inode_operations kvm_gmem_iops = {
>         .getattr = kvm_gmem_getattr,
>         .setattr = kvm_gmem_setattr,
> };
>
> static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> {
>         const char *anon_name = "[kvm-gmem]";
>         struct kvm_gmem *gmem;
>         struct inode *inode;
>         struct file *file;
>         int fd, err;
>
>         fd = get_unused_fd_flags(0);
>         if (fd < 0)
>                 return fd;
>
>         gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
>         if (!gmem) {
>                 err = -ENOMEM;
>                 goto err_fd;
>         }
>
>         file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
>                                          O_RDWR, NULL);
>         if (IS_ERR(file)) {
>                 err = PTR_ERR(file);
>                 goto err_gmem;
>         }
>
>         file->f_flags |= O_LARGEFILE;
>
>         inode = file->f_inode;
>         WARN_ON(file->f_mapping != inode->i_mapping);
>
>         inode->i_private = (void *)(unsigned long)flags;
>         inode->i_op = &kvm_gmem_iops;
>         inode->i_mapping->a_ops = &kvm_gmem_aops;
>         inode->i_mapping->flags |= AS_INACCESSIBLE;
>         inode->i_mode |= S_IFREG;
>         inode->i_size = size;
>         mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
>         mapping_set_unmovable(inode->i_mapping);
>         /* Unmovable mappings are supposed to be marked unevictable as well. */
>         WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>
>         kvm_get_kvm(kvm);
>         gmem->kvm = kvm;
>         xa_init(&gmem->bindings);
>         list_add(&gmem->entry, &inode->i_mapping->i_private_list);
>
>         fd_install(fd, file);
>         return fd;
>
> err_gmem:
>         kfree(gmem);
> err_fd:
>         put_unused_fd(fd);
>         return err;
> }
>
> int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> {
>         loff_t size = args->size;
>         u64 flags = args->flags;
>         u64 valid_flags = 0;
>
>         if (flags & ~valid_flags)
>                 return -EINVAL;
>
>         if (size <= 0 || !PAGE_ALIGNED(size))
>                 return -EINVAL;
>
>         return __kvm_gmem_create(kvm, size, flags);
> }
>
> int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>                   unsigned int fd, loff_t offset)
> {
>         loff_t size = slot->npages << PAGE_SHIFT;
>         unsigned long start, end;
>         struct kvm_gmem *gmem;
>         struct inode *inode;
>         struct file *file;
>         int r = -EINVAL;
>
>         BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
>
>         file = fget(fd);
>         if (!file)
>                 return -EBADF;
>
>         if (file->f_op != &kvm_gmem_fops)
>                 goto err;
>
>         gmem = file->private_data;
>         if (gmem->kvm != kvm)
>                 goto err;
>
>         inode = file_inode(file);
>
>         if (offset < 0 || !PAGE_ALIGNED(offset) ||
>             offset + size > i_size_read(inode))
>                 goto err;
>
>         filemap_invalidate_lock(inode->i_mapping);
>
>         start = offset >> PAGE_SHIFT;
>         end = start + slot->npages;
>
>         if (!xa_empty(&gmem->bindings) &&
>             xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
>                 filemap_invalidate_unlock(inode->i_mapping);
>                 goto err;
>         }
>
>         /*
>          * No synchronize_rcu() needed, any in-flight readers are guaranteed to
>          * see either a NULL file or this new file, no need for them to go
>          * away.
>          */
>         rcu_assign_pointer(slot->gmem.file, file);
>         slot->gmem.pgoff = start;
>
>         xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
>         filemap_invalidate_unlock(inode->i_mapping);
>
>         /*
>          * Drop the reference to the file, even on success.  The file pins KVM,
>          * not the other way 'round.  Active bindings are invalidated if the
>          * file is closed before memslots are destroyed.
>          */
>         r = 0;
> err:
>         fput(file);
>         return r;
> }
>
> void kvm_gmem_unbind(struct kvm_memory_slot *slot)
> {
>         unsigned long start = slot->gmem.pgoff;
>         unsigned long end = start + slot->npages;
>         struct kvm_gmem *gmem;
>         struct file *file;
>
>         /*
>          * Nothing to do if the underlying file was already closed (or is being
>          * closed right now), kvm_gmem_release() invalidates all bindings.
>          */
>         file = kvm_gmem_get_file(slot);
>         if (!file)
>                 return;
>
>         gmem = file->private_data;
>
>         filemap_invalidate_lock(file->f_mapping);
>         xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
>         rcu_assign_pointer(slot->gmem.file, NULL);
>         synchronize_rcu();
>         filemap_invalidate_unlock(file->f_mapping);
>
>         fput(file);
> }
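
[Editor's note, not part of the quoted mail] For readers trying to see where the undefined CONFIG_HAVE_KVM_GMEM_INITIALIZE path would plug in, below is a minimal, purely illustrative stub of an architecture hook with the kvm_arch_gmem_initialize() signature assumed by the sketch above. It only zeroes or copies the initial image from userspace; a real SNP or TDX implementation would instead hand each page to trusted firmware (e.g. SNP_LAUNCH_UPDATE, or TDH.MEM.PAGE.ADD plus TDH.MR.EXTEND) and skip the redundant clearing.

/*
 * Illustrative stub only, not taken from the patch above.  A real SNP/TDX
 * implementation would pass each page to trusted firmware instead of (or in
 * addition to) a plain copy/clear from the kernel.
 */
int kvm_arch_gmem_initialize(struct kvm *kvm, struct folio *folio,
                             pgoff_t index, void __user *src, void *opaque)
{
        unsigned long nr_pages = folio_nr_pages(folio);
        unsigned long i;

        for (i = 0; i < nr_pages; i++) {
                struct page *page = folio_file_page(folio, index + i);
                unsigned long left;
                void *dst;

                if (!src) {
                        /* No source image, hand the guest a zeroed page. */
                        clear_highpage(page);
                        continue;
                }

                /* Copy one page of the initial image from userspace. */
                dst = kmap_local_page(page);
                left = copy_from_user(dst, src + i * PAGE_SIZE, PAGE_SIZE);
                kunmap_local(dst);

                if (left)
                        return -EFAULT;
        }

        return 0;
}

In this stub, a NULL src corresponds to the kvm_gmem_get_pfn() fault path (zero-fill on first use), while a non-NULL src corresponds to the kvm_gmem_populate() path.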