On Fri, Mar 31, 2023 at 11:50:39PM +0000, Ackerley Tng wrote: > By default, the backing shmem file for a restrictedmem fd is created > on shmem's kernel space mount. > > With this patch, an optional tmpfs mount can be specified via an fd, > which will be used as the mountpoint for backing the shmem file > associated with a restrictedmem fd. > > This will help restrictedmem fds inherit the properties of the > provided tmpfs mounts, for example, hugepage allocation hints, NUMA > binding hints, etc. > > Permissions for the fd passed to memfd_restricted() is modeled after > the openat() syscall, since both of these allow creation of a file > upon a mount/directory. > > Permission to reference the mount the fd represents is checked upon fd > creation by other syscalls (e.g. fsmount(), open(), or open_tree(), > etc) and any process that can present memfd_restricted() with a valid > fd is expected to have obtained permission to use the mount > represented by the fd. This behavior is intended to parallel that of > the openat() syscall. > > memfd_restricted() will check that the tmpfs superblock is > writable, and that the mount is also writable, before attempting to > create a restrictedmem file on the mount. 
> > Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx> > --- > include/linux/syscalls.h | 2 +- > include/uapi/linux/restrictedmem.h | 8 ++++ > mm/restrictedmem.c | 74 +++++++++++++++++++++++++++--- > 3 files changed, 77 insertions(+), 7 deletions(-) > create mode 100644 include/uapi/linux/restrictedmem.h > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index f9e9e0c820c5..a23c4c385cd3 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -1056,7 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags); > asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, > unsigned long home_node, > unsigned long flags); > -asmlinkage long sys_memfd_restricted(unsigned int flags); > +asmlinkage long sys_memfd_restricted(unsigned int flags, int mount_fd); > > /* > * Architecture-specific system calls > diff --git a/include/uapi/linux/restrictedmem.h b/include/uapi/linux/restrictedmem.h > new file mode 100644 > index 000000000000..22d6f2285f6d > --- /dev/null > +++ b/include/uapi/linux/restrictedmem.h > @@ -0,0 +1,8 @@ > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > +#ifndef _UAPI_LINUX_RESTRICTEDMEM_H > +#define _UAPI_LINUX_RESTRICTEDMEM_H > + > +/* flags for memfd_restricted */ > +#define RMFD_USERMNT 0x0001U > + > +#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */ > diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c > index c5d869d8c2d8..f7b62364a31a 100644 > --- a/mm/restrictedmem.c > +++ b/mm/restrictedmem.c > @@ -1,11 +1,12 @@ > // SPDX-License-Identifier: GPL-2.0 > -#include "linux/sbitmap.h" > +#include <linux/namei.h> > #include <linux/pagemap.h> > #include <linux/pseudo_fs.h> > #include <linux/shmem_fs.h> > #include <linux/syscalls.h> > #include <uapi/linux/falloc.h> > #include <uapi/linux/magic.h> > +#include <uapi/linux/restrictedmem.h> > #include <linux/restrictedmem.h> > > struct restrictedmem { > @@ -189,19 +190,20 @@ static struct file *restrictedmem_file_create(struct 
file *memfd) > return file; > } > > -SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags) > +static int restrictedmem_create(struct vfsmount *mount) > { > struct file *file, *restricted_file; > int fd, err; > > - if (flags) > - return -EINVAL; > - > fd = get_unused_fd_flags(0); > if (fd < 0) > return fd; > > - file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE); > + if (mount) > + file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0, VM_NORESERVE); > + else > + file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE); > + > if (IS_ERR(file)) { > err = PTR_ERR(file); > goto err_fd; > @@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags) > return err; > } > > +static bool is_shmem_mount(struct vfsmount *mnt) > +{ > + return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC; > +} > + > +static bool is_mount_root(struct file *file) > +{ > + return file->f_path.dentry == file->f_path.mnt->mnt_root; > +} > + > +static int restrictedmem_create_on_user_mount(int mount_fd) > +{ > + int ret; > + struct fd f; > + struct vfsmount *mnt; > + > + f = fdget_raw(mount_fd); > + if (!f.file) > + return -EBADF; > + > + ret = -EINVAL; > + if (!is_mount_root(f.file)) > + goto out; > + > + mnt = f.file->f_path.mnt; > + if (!is_shmem_mount(mnt)) > + goto out; > + > + ret = file_permission(f.file, MAY_WRITE | MAY_EXEC); Why MAY_EXEC? > + if (ret) > + goto out; > + > + ret = mnt_want_write(mnt); > + if (unlikely(ret)) > + goto out; > + > + ret = restrictedmem_create(mnt); > + > + mnt_drop_write(mnt); > +out: > + fdput(f); > + > + return ret; > +} We need a review from fs folks. Looks mostly sensible, but I have no experience in fs. 
> + > +SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd) > +{ > + if (flags & ~RMFD_USERMNT) > + return -EINVAL; > + > + if (flags == RMFD_USERMNT) { > + if (mount_fd < 0) > + return -EINVAL; > + > + return restrictedmem_create_on_user_mount(mount_fd); > + } else { > + return restrictedmem_create(NULL); > + } Maybe restructure with a single restrictedmem_create() call? struct vfsmount *mnt = NULL; if (flags == RMFD_USERMNT) { ... mnt = ...(); } return restrictedmem_create(mnt); > +} > + > int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end, > struct restrictedmem_notifier *notifier, bool exclusive) > { > -- > 2.40.0.348.gf938b09366-goog -- Kiryl Shutsemau / Kirill A. Shutemov