Hi Gavin, On Fri, 24 Jan 2025 at 04:26, Gavin Shan <gshan@xxxxxxxxxx> wrote: > > Hi Fuad, > > On 1/18/25 2:29 AM, Fuad Tabba wrote: > > From: Ackerley Tng <ackerleytng@xxxxxxxxxx> > > > > Using guest mem inodes allows us to store metadata for the backing > > memory on the inode. Metadata will be added in a later patch to > > support HugeTLB pages. > > > > Metadata about backing memory should not be stored on the file, since > > the file represents a guest_memfd's binding with a struct kvm, and > > metadata about backing memory is not unique to a specific binding and > > struct kvm. > > > > Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx> > > Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx> > > --- > > include/uapi/linux/magic.h | 1 + > > virt/kvm/guest_memfd.c | 119 ++++++++++++++++++++++++++++++------- > > 2 files changed, 100 insertions(+), 20 deletions(-) > > > > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h > > index bb575f3ab45e..169dba2a6920 100644 > > --- a/include/uapi/linux/magic.h > > +++ b/include/uapi/linux/magic.h > > @@ -103,5 +103,6 @@ > > #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ > > #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ > > #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ > > +#define GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */ > > > > #endif /* __LINUX_MAGIC_H__ */ > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > > index 47a9f68f7b24..198554b1f0b5 100644 > > --- a/virt/kvm/guest_memfd.c > > +++ b/virt/kvm/guest_memfd.c > > @@ -1,12 +1,17 @@ > > // SPDX-License-Identifier: GPL-2.0 > > +#include <linux/fs.h> > > +#include <linux/mount.h> > > This can be dropped since "linux/mount.h" is already included by "linux/fs.h". 
> > > #include <linux/backing-dev.h> > > #include <linux/falloc.h> > > #include <linux/kvm_host.h> > > +#include <linux/pseudo_fs.h> > > #include <linux/pagemap.h> > > #include <linux/anon_inodes.h> > > > > #include "kvm_mm.h" > > > > +static struct vfsmount *kvm_gmem_mnt; > > + > > struct kvm_gmem { > > struct kvm *kvm; > > struct xarray bindings; > > @@ -307,6 +312,38 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) > > return gfn - slot->base_gfn + slot->gmem.pgoff; > > } > > > > +static const struct super_operations kvm_gmem_super_operations = { > > + .statfs = simple_statfs, > > +}; > > + > > +static int kvm_gmem_init_fs_context(struct fs_context *fc) > > +{ > > + struct pseudo_fs_context *ctx; > > + > > + if (!init_pseudo(fc, GUEST_MEMORY_MAGIC)) > > + return -ENOMEM; > > + > > + ctx = fc->fs_private; > > + ctx->ops = &kvm_gmem_super_operations; > > + > > + return 0; > > +} > > + > > +static struct file_system_type kvm_gmem_fs = { > > + .name = "kvm_guest_memory", > > + .init_fs_context = kvm_gmem_init_fs_context, > > + .kill_sb = kill_anon_super, > > +}; > > + > > +static void kvm_gmem_init_mount(void) > > +{ > > + kvm_gmem_mnt = kern_mount(&kvm_gmem_fs); > > + BUG_ON(IS_ERR(kvm_gmem_mnt)); > > + > > + /* For giggles. Userspace can never map this anyways. 
*/ > > + kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; > > +} > > + > > static struct file_operations kvm_gmem_fops = { > > .open = generic_file_open, > > .release = kvm_gmem_release, > > @@ -316,6 +353,8 @@ static struct file_operations kvm_gmem_fops = { > > void kvm_gmem_init(struct module *module) > > { > > kvm_gmem_fops.owner = module; > > + > > + kvm_gmem_init_mount(); > > } > > > > static int kvm_gmem_migrate_folio(struct address_space *mapping, > > @@ -397,11 +436,67 @@ static const struct inode_operations kvm_gmem_iops = { > > .setattr = kvm_gmem_setattr, > > }; > > > > +static struct inode *kvm_gmem_inode_make_secure_inode(const char *name, > > + loff_t size, u64 flags) > > +{ > > + const struct qstr qname = QSTR_INIT(name, strlen(name)); > > + struct inode *inode; > > + int err; > > + > > + inode = alloc_anon_inode(kvm_gmem_mnt->mnt_sb); > > + if (IS_ERR(inode)) > > + return inode; > > + > > + err = security_inode_init_security_anon(inode, &qname, NULL); > > + if (err) { > > + iput(inode); > > + return ERR_PTR(err); > > + } > > + > > + inode->i_private = (void *)(unsigned long)flags; > > + inode->i_op = &kvm_gmem_iops; > > + inode->i_mapping->a_ops = &kvm_gmem_aops; > > + inode->i_mode |= S_IFREG; > > + inode->i_size = size; > > + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); > > + mapping_set_inaccessible(inode->i_mapping); > > + /* Unmovable mappings are supposed to be marked unevictable as well. 
*/ > > + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); > > + > > + return inode; > > +} > > + > > +static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size, > > + u64 flags) > > +{ > > + static const char *name = "[kvm-gmem]"; > > + struct inode *inode; > > + struct file *file; > > + > > + if (kvm_gmem_fops.owner && !try_module_get(kvm_gmem_fops.owner)) > > + return ERR_PTR(-ENOENT); > > + > > The check on 'kvm_gmem_fops.owner' can be removed since try_module_get() > and module_put() accept a NULL argument, even when CONFIG_MODULE_UNLOAD == N. > > A module_put(kvm_gmem_fops.owner) is needed on the various error paths in > this function. Otherwise, the reference count of the owner (module) will become > unbalanced on any error. > > > > + inode = kvm_gmem_inode_make_secure_inode(name, size, flags); > > + if (IS_ERR(inode)) > > + return ERR_CAST(inode); > > + > > ERR_CAST may be dropped since there is nothing to be cast or converted? > > > + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, > > + &kvm_gmem_fops); > > + if (IS_ERR(file)) { > > + iput(inode); > > + return file; > > + } > > + > > + file->f_mapping = inode->i_mapping; > > + file->f_flags |= O_LARGEFILE; > > + file->private_data = priv; > > + > > 'file->f_mapping = inode->i_mapping' may be dropped since it's already correctly > set by alloc_file_pseudo(). > > alloc_file_pseudo > alloc_path_pseudo > alloc_file > alloc_empty_file > file_init_path // Set by this function Thanks for the fixes. Will include them when we respin. 
Cheers, /fuad > > + return file; > > +} > > + > > static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) > > { > > - const char *anon_name = "[kvm-gmem]"; > > struct kvm_gmem *gmem; > > - struct inode *inode; > > struct file *file; > > int fd, err; > > > > @@ -415,32 +510,16 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) > > goto err_fd; > > } > > > > - file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem, > > - O_RDWR, NULL); > > + file = kvm_gmem_inode_create_getfile(gmem, size, flags); > > if (IS_ERR(file)) { > > err = PTR_ERR(file); > > goto err_gmem; > > } > > > > - file->f_flags |= O_LARGEFILE; > > - > > - inode = file->f_inode; > > - WARN_ON(file->f_mapping != inode->i_mapping); > > - > > - inode->i_private = (void *)(unsigned long)flags; > > - inode->i_op = &kvm_gmem_iops; > > - inode->i_mapping->a_ops = &kvm_gmem_aops; > > - inode->i_mode |= S_IFREG; > > - inode->i_size = size; > > - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); > > - mapping_set_inaccessible(inode->i_mapping); > > - /* Unmovable mappings are supposed to be marked unevictable as well. */ > > - WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); > > - > > kvm_get_kvm(kvm); > > gmem->kvm = kvm; > > xa_init(&gmem->bindings); > > - list_add(&gmem->entry, &inode->i_mapping->i_private_list); > > + list_add(&gmem->entry, &file_inode(file)->i_mapping->i_private_list); > > > > fd_install(fd, file); > > return fd; > > Thanks, > Gavin >