> +KVM_SET_USER_MEMORY_REGION2 is an extension to KVM_SET_USER_MEMORY_REGION that > +allows mapping guest_memfd memory into a guest. All fields shared with > +KVM_SET_USER_MEMORY_REGION identically. Userspace can set KVM_MEM_PRIVATE in > +flags to have KVM bind the memory region to a given guest_memfd range of > +[guest_memfd_offset, guest_memfd_offset + memory_size]. The target guest_memfd ^ The range end should be exclusive, is it? > +must point at a file created via KVM_CREATE_GUEST_MEMFD on the current VM, and > +the target range must not be bound to any other memory region. All standard > +bounds checks apply (use common sense). > + > :: > > struct kvm_userspace_memory_region2 { > @@ -6087,9 +6096,24 @@ applied. > __u64 guest_phys_addr; > __u64 memory_size; /* bytes */ > __u64 userspace_addr; /* start of the userspace allocated memory */ > + __u64 guest_memfd_offset; > + __u32 guest_memfd; > + __u32 pad1; > + __u64 pad2[14]; > }; > [...] > +static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) > +{ > + const char *anon_name = "[kvm-gmem]"; > + struct kvm_gmem *gmem; > + struct inode *inode; > + struct file *file; > + int fd, err; > + > + fd = get_unused_fd_flags(0); > + if (fd < 0) > + return fd; > + > + gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); > + if (!gmem) { > + err = -ENOMEM; > + goto err_fd; > + } > + > + /* > + * Use the so called "secure" variant, which creates a unique inode > + * instead of reusing a single inode. Each guest_memfd instance needs > + * its own inode to track the size, flags, etc. > + */ > + file = anon_inode_getfile_secure(anon_name, &kvm_gmem_fops, gmem, > + O_RDWR, NULL); > + if (IS_ERR(file)) { > + err = PTR_ERR(file); > + goto err_gmem; > + } > + > + file->f_flags |= O_LARGEFILE; > + > + inode = file->f_inode; > + WARN_ON(file->f_mapping != inode->i_mapping); Just curious, why should we check the mapping fields which is garanteed in other subsystem? > + > + inode->i_private = (void *)(unsigned long)flags; > + inode->i_op = &kvm_gmem_iops; > + inode->i_mapping->a_ops = &kvm_gmem_aops; > + inode->i_mode |= S_IFREG; > + inode->i_size = size; > + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); > + mapping_set_unmovable(inode->i_mapping); > + /* Unmovable mappings are supposed to be marked unevictable as well. */ > + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); > + > + kvm_get_kvm(kvm); > + gmem->kvm = kvm; > + xa_init(&gmem->bindings); > + list_add(&gmem->entry, &inode->i_mapping->private_list); > + > + fd_install(fd, file); > + return fd; > + > +err_gmem: > + kfree(gmem); > +err_fd: > + put_unused_fd(fd); > + return err; > +} [...] > +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, > + unsigned int fd, loff_t offset) > +{ > + loff_t size = slot->npages << PAGE_SHIFT; > + unsigned long start, end; > + struct kvm_gmem *gmem; > + struct inode *inode; > + struct file *file; > + > + BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff)); > + > + file = fget(fd); > + if (!file) > + return -EBADF; > + > + if (file->f_op != &kvm_gmem_fops) > + goto err; > + > + gmem = file->private_data; > + if (gmem->kvm != kvm) > + goto err; > + > + inode = file_inode(file); > + > + if (offset < 0 || !PAGE_ALIGNED(offset)) > + return -EINVAL; Should also "goto err" here. > + > + if (offset + size > i_size_read(inode)) > + goto err; > + > + filemap_invalidate_lock(inode->i_mapping); > + > + start = offset >> PAGE_SHIFT; > + end = start + slot->npages; > + > + if (!xa_empty(&gmem->bindings) && > + xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { > + filemap_invalidate_unlock(inode->i_mapping); > + goto err; > + } > + > + /* > + * No synchronize_rcu() needed, any in-flight readers are guaranteed to > + * be see either a NULL file or this new file, no need for them to go > + * away. > + */ > + rcu_assign_pointer(slot->gmem.file, file); > + slot->gmem.pgoff = start; > + > + xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); > + filemap_invalidate_unlock(inode->i_mapping); > + > + /* > + * Drop the reference to the file, even on success. The file pins KVM, > + * not the other way 'round. Active bindings are invalidated if the ^ around? Thanks, Yilun > + * file is closed before memslots are destroyed. > + */ > + fput(file); > + return 0; > + > +err: > + fput(file); > + return -EINVAL; > +}