On 10/26/2017 06:20 AM, marcandre.lureau@xxxxxxxxxx wrote: > From: Marc-André Lureau <marcandre.lureau@xxxxxxxxxx> > > Hi, > > Recently, Mike Kravetz added hugetlbfs support to memfd. However, he > didn't need sealing support. One of the reasons to use memfd is to > have shared memory sealing when doing IPC. Qemu uses shared memory & > hugetables with vhost-user (used by dpdk). I am adding memfd support > in qemu, for convenience and security reasons. It would be great if > hugetlbfs had sealing support. I don't have much experience with > kernel & mm, but I came up with the following patch. It works with a > slightly modified memfd_test.c, enabling most tests again (only > pwrite() has to be skipped). I would like to have some early feedback > before I send a cleaned up series on the lkml. > This is the first time I have looked closely at the sealing code, so I may have missed something. However, in general the changes seem reasonable and follow what was done for sealing in existing code. Just a few nits below. > Thanks! > > Signed-off-by: Marc-André Lureau <marcandre.lureau@xxxxxxxxxx> > --- > fs/hugetlbfs/inode.c | 36 ++++++++++++++++++++++++---------- > include/linux/hugetlb.h | 11 +++++++++++ > include/linux/shmem_fs.h | 2 -- > mm/shmem.c | 51 +++++++++++++++++++++++++----------------------- > 4 files changed, 64 insertions(+), 36 deletions(-) > > diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c > index 59073e9f01a4..a39c315e6d0c 100644 > --- a/fs/hugetlbfs/inode.c > +++ b/fs/hugetlbfs/inode.c > @@ -55,16 +55,6 @@ struct hugetlbfs_config { > umode_t mode; > }; > > -struct hugetlbfs_inode_info { > - struct shared_policy policy; > - struct inode vfs_inode; > -}; > - > -static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) > -{ > - return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); > -} > - > int sysctl_hugetlb_shm_group; > > enum { > @@ -520,8 +510,16 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) > > if (hole_end > hole_start) { > struct address_space *mapping = inode->i_mapping; > + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); > > inode_lock(inode); > + > + /* protected by i_mutex */ > + if (info->seals & F_SEAL_WRITE) { > + inode_unlock(inode); > + return -EPERM; > + } > + > i_mmap_lock_write(mapping); > if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) > hugetlb_vmdelete_list(&mapping->i_mmap, > @@ -539,6 +537,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, > loff_t len) > { > struct inode *inode = file_inode(file); > + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); > struct address_space *mapping = inode->i_mapping; > struct hstate *h = hstate_inode(inode); > struct vm_area_struct pseudo_vma; > @@ -570,6 +569,12 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, > if (error) > goto out; > > + /* do we need this if FL_KEEP_SIZE is mandatory? let's be careful */ Not sure I understand "FL_KEEP_SIZE is mandatory". It is not mandatory. So, the code/check is required. > + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { > + error = -EPERM; > + goto out; > + } > + > /* > * Initialize a pseudo vma as this is required by the huge page > * allocation routines. If NUMA is configured, use page index > @@ -660,6 +665,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) > struct hstate *h = hstate_inode(inode); > int error; > unsigned int ia_valid = attr->ia_valid; > + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); > > BUG_ON(!inode); > > @@ -668,9 +674,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) > return error; > > if (ia_valid & ATTR_SIZE) { > + loff_t oldsize = inode->i_size; > + loff_t newsize = attr->ia_size; > + > error = -EINVAL; > if (attr->ia_size & ~huge_page_mask(h)) Might want to change 'attr->ia_size' to newsize here. > return -EINVAL; > + /* protected by i_mutex */ > + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || > + (newsize > oldsize && (info->seals & F_SEAL_GROW))) > + return -EPERM; > error = hugetlb_vmtruncate(inode, attr->ia_size); and here as well. > if (error) > return error; > @@ -723,6 +736,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, > > inode = new_inode(sb); > if (inode) { > + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); > + > inode->i_ino = get_next_ino(); > inode_init_owner(inode, dir, mode); > lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, > @@ -730,6 +745,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, > inode->i_mapping->a_ops = &hugetlbfs_aops; > inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); > inode->i_mapping->private_data = resv_map; > + info->seals = F_SEAL_SEAL; > switch (mode & S_IFMT) { > default: > init_special_inode(inode, mode, dev); > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 8bbbd37ab105..42945aa3bcc1 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -44,6 +44,17 @@ extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, > #include <linux/shm.h> > #include <asm/tlbflush.h> > > +struct hugetlbfs_inode_info { > + struct shared_policy policy; > + struct inode vfs_inode; > + unsigned int seals; > +}; > + > +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) > +{ > + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); > +} > + This header file is a mess (not your fault!). I would suggest putting this after: #ifdef CONFIG_HUGETLBFS <and super block information> > struct hugepage_subpool { > spinlock_t lock; > long count; > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > index b6c3540e07bc..557d0c3b6eca 100644 > --- a/include/linux/shmem_fs.h > +++ b/include/linux/shmem_fs.h > @@ -109,8 +109,6 @@ extern void shmem_uncharge(struct inode *inode, long pages); > > #ifdef CONFIG_TMPFS > > -extern int shmem_add_seals(struct file *file, unsigned int seals); > -extern int shmem_get_seals(struct file *file); > extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); > > #else > diff --git a/mm/shmem.c b/mm/shmem.c > index 07a1d22807be..6ceb5f43cb10 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -2722,10 +2722,10 @@ static int shmem_wait_for_pins(struct address_space *mapping) > F_SEAL_GROW | \ > F_SEAL_WRITE) > > -int shmem_add_seals(struct file *file, unsigned int seals) > +static int memfd_add_seals(struct file *file, unsigned int seals) > { > struct inode *inode = file_inode(file); > - struct shmem_inode_info *info = SHMEM_I(inode); > + unsigned int *info_seals; > int error; > > /* > @@ -2758,8 +2758,13 @@ int shmem_add_seals(struct file *file, unsigned int seals) > * other file types. > */ > > - if (file->f_op != &shmem_file_operations) > + if (file->f_op == &shmem_file_operations) > + info_seals = &SHMEM_I(inode)->seals; > + else if (file->f_op == &hugetlbfs_file_operations) > + info_seals = &HUGETLBFS_I(inode)->seals; > + else > return -EINVAL; > + > if (!(file->f_mode & FMODE_WRITE)) > return -EPERM; > if (seals & ~(unsigned int)F_ALL_SEALS) > @@ -2767,12 +2772,12 @@ int shmem_add_seals(struct file *file, unsigned int seals) > > inode_lock(inode); > > - if (info->seals & F_SEAL_SEAL) { > + if (*info_seals & F_SEAL_SEAL) { > error = -EPERM; > goto unlock; > } > > - if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { > + if ((seals & F_SEAL_WRITE) && !(*info_seals & F_SEAL_WRITE)) { > error = mapping_deny_writable(file->f_mapping); > if (error) > goto unlock; > @@ -2784,23 +2789,24 @@ int shmem_add_seals(struct file *file, unsigned int seals) > } > } > > - info->seals |= seals; > + *info_seals |= seals; > error = 0; > > unlock: > inode_unlock(inode); > return error; > } > -EXPORT_SYMBOL_GPL(shmem_add_seals); > > -int shmem_get_seals(struct file *file) > +static int memfd_get_seals(struct file *file) > { > - if (file->f_op != &shmem_file_operations) > - return -EINVAL; > + if (file->f_op == &shmem_file_operations) > + return SHMEM_I(file_inode(file))->seals; > + > + if (file->f_op == &hugetlbfs_file_operations) > + return HUGETLBFS_I(file_inode(file))->seals; > > - return SHMEM_I(file_inode(file))->seals; > + return -EINVAL; > } > -EXPORT_SYMBOL_GPL(shmem_get_seals); I can see why the routine name change makes sense. However, I have no idea why shmem_add/get_seals are extern and exported today. Perhaps Hugh or someone else can comment? -- Mike Kravetz > > long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) > { > @@ -2812,10 +2818,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) > if (arg > UINT_MAX) > return -EINVAL; > > - error = shmem_add_seals(file, arg); > + error = memfd_add_seals(file, arg); > break; > case F_GET_SEALS: > - error = shmem_get_seals(file); > + error = memfd_get_seals(file); > break; > default: > error = -EINVAL; > @@ -3659,19 +3665,16 @@ SYSCALL_DEFINE2(memfd_create, > const char __user *, uname, > unsigned int, flags) > { > - struct shmem_inode_info *info; > struct file *file; > int fd, error; > char *name; > long len; > + unsigned int *info_seals; > > if (!(flags & MFD_HUGETLB)) { > if (flags & ~(unsigned int)MFD_ALL_FLAGS) > return -EINVAL; > } else { > - /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ > - if (flags & MFD_ALLOW_SEALING) > - return -EINVAL; > /* Allow huge page size encoding in flags. */ > if (flags & ~(unsigned int)(MFD_ALL_FLAGS | > (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) > @@ -3723,13 +3726,13 @@ SYSCALL_DEFINE2(memfd_create, > file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; > file->f_flags |= O_RDWR | O_LARGEFILE; > > + if (flags & MFD_HUGETLB) > + info_seals = &HUGETLBFS_I(file_inode(file))->seals; > + else > + info_seals = &SHMEM_I(file_inode(file))->seals; > + > if (flags & MFD_ALLOW_SEALING) { > - /* > - * flags check at beginning of function ensures > - * this is not a hugetlbfs (MFD_HUGETLB) file. > - */ > - info = SHMEM_I(file_inode(file)); > - info->seals &= ~F_SEAL_SEAL; > + *info_seals &= ~F_SEAL_SEAL; > } > > fd_install(fd, file); > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>