On Sat, Aug 20, 2022 at 10:15:32PM -0700, Hugh Dickins wrote: > > I will try next week to rework it as shim to top of shmem. Does it work > > for you? > > Yes, please do, thanks. It's a compromise between us: the initial TDX > case has no justification to use shmem at all, but doing it that way > will help you with some of the infrastructure, and will probably be > easiest for KVM to extend to other more relaxed fd cases later. Okay, below is my take on the shim approach. I don't hate how it turned out. It is easier to understand without callback exchange thing. The only caveat is I had to introduce external lock to protect against race between lookup and truncate. Otherwise, looks pretty reasonable to me. I did very limited testing. And it lacks integration with KVM, but API changed not substantially, any it should be easy to adopt. Any comments? diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 4f1600413f91..aec04a0f8b7b 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -3,6 +3,7 @@ #define __LINUX_MEMFD_H #include <linux/file.h> +#include <linux/pfn_t.h> #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); @@ -13,4 +14,27 @@ static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) } #endif +struct inaccessible_notifier; + +struct inaccessible_notifier_ops { + void (*invalidate)(struct inaccessible_notifier *notifier, + pgoff_t start, pgoff_t end); +}; + +struct inaccessible_notifier { + struct list_head list; + const struct inaccessible_notifier_ops *ops; +}; + +int inaccessible_register_notifier(struct file *file, + struct inaccessible_notifier *notifier); +void inaccessible_unregister_notifier(struct file *file, + struct inaccessible_notifier *notifier); + +int inaccessible_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn, + int *order); +void inaccessible_put_pfn(struct file *file, pfn_t pfn); + +struct file *memfd_mkinaccessible(struct file *memfd); + #endif /* __LINUX_MEMFD_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 6325d1d0e90f..9d066be3d7e8 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -101,5 +101,6 @@ #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ +#define INACCESSIBLE_MAGIC 0x494e4143 /* "INAC" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 7a8a26751c23..48750474b904 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -8,6 +8,7 @@ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U #define MFD_HUGETLB 0x0004U +#define MFD_INACCESSIBLE 0x0008U /* * Huge page size encoding when MFD_HUGETLB is specified, and a huge page diff --git a/mm/Makefile b/mm/Makefile index 9a564f836403..f82e5d4b4388 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -126,7 +126,7 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o -obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o memfd_inaccessible.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o diff --git a/mm/memfd.c b/mm/memfd.c index 08f5f8304746..1853a90f49ff 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -261,7 +261,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \ + MFD_INACCESSIBLE) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, @@ -283,6 +284,14 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; } + /* Disallow sealing when MFD_INACCESSIBLE is set. */ + if ((flags & MFD_INACCESSIBLE) && (flags & MFD_ALLOW_SEALING)) + return -EINVAL; + + /* TODO: add hugetlb support */ + if ((flags & MFD_INACCESSIBLE) && (flags & MFD_HUGETLB)) + return -EINVAL; + /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) @@ -331,10 +340,24 @@ SYSCALL_DEFINE2(memfd_create, *file_seals &= ~F_SEAL_SEAL; } + if (flags & MFD_INACCESSIBLE) { + struct file *inaccessible_file; + + inaccessible_file = memfd_mkinaccessible(file); + if (IS_ERR(inaccessible_file)) { + error = PTR_ERR(inaccessible_file); + goto err_file; + } + + file = inaccessible_file; + } + fd_install(fd, file); kfree(name); return fd; +err_file: + fput(file); err_fd: put_unused_fd(fd); err_name: diff --git a/mm/memfd_inaccessible.c b/mm/memfd_inaccessible.c new file mode 100644 index 000000000000..89194438af9c --- /dev/null +++ b/mm/memfd_inaccessible.c @@ -0,0 +1,234 @@ +#include <linux/memfd.h> +#include <linux/pagemap.h> +#include <linux/pseudo_fs.h> +#include <linux/shmem_fs.h> +#include <uapi/linux/falloc.h> +#include <uapi/linux/magic.h> + +struct inaccessible_data { + struct rw_semaphore lock; + struct file *memfd; + struct list_head notifiers; +}; + +static void inaccessible_notifier_invalidate(struct inaccessible_data *data, + pgoff_t start, pgoff_t end) +{ + struct inaccessible_notifier *notifier; + + lockdep_assert_held(&data->lock); + VM_BUG_ON(!rwsem_is_locked(&data->lock)); + + list_for_each_entry(notifier, &data->notifiers, list) { + notifier->ops->invalidate(notifier, start, end); + } +} + +static int inaccessible_release(struct inode *inode, struct file *file) +{ + struct inaccessible_data *data = inode->i_mapping->private_data; + + fput(data->memfd); + kfree(data); + return 0; +} + +static long inaccessible_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inaccessible_data *data = file->f_mapping->private_data; + struct file *memfd = data->memfd; + int ret; + + /* The lock prevents parallel inaccessible_get/put_pfn() */ + down_write(&data->lock); + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) { + ret = -EINVAL; + goto out; + } + } + + ret = memfd->f_op->fallocate(memfd, mode, offset, len); + inaccessible_notifier_invalidate(data, offset, offset + len); +out: + up_write(&data->lock); + return ret; +} + +static const struct file_operations inaccessible_fops = { + .release = inaccessible_release, + .fallocate = inaccessible_fallocate, +}; + +static int inaccessible_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct inaccessible_data *data = inode->i_mapping->private_data; + struct file *memfd = data->memfd; + + return memfd->f_inode->i_op->getattr(mnt_userns, path, stat, + request_mask, query_flags); +} + +static int inaccessible_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct inaccessible_data *data = inode->i_mapping->private_data; + struct file *memfd = data->memfd; + int ret; + + if (attr->ia_valid & ATTR_SIZE) { + if (memfd->f_inode->i_size) { + ret = -EPERM; + goto out; + } + + if (!PAGE_ALIGNED(attr->ia_size)) { + ret = -EINVAL; + goto out; + } + } + + ret = memfd->f_inode->i_op->setattr(mnt_userns, + file_dentry(memfd), attr); +out: + return ret; +} + +static const struct inode_operations inaccessible_iops = { + .getattr = inaccessible_getattr, + .setattr = inaccessible_setattr, +}; + +static int inaccessible_init_fs_context(struct fs_context *fc) +{ + if (!init_pseudo(fc, INACCESSIBLE_MAGIC)) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + return 0; +} + +static struct file_system_type inaccessible_fs = { + .owner = THIS_MODULE, + .name = "[inaccessible]", + .init_fs_context = inaccessible_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static struct vfsmount *inaccessible_mnt; + +static __init int inaccessible_init(void) +{ + inaccessible_mnt = kern_mount(&inaccessible_fs); + if (IS_ERR(inaccessible_mnt)) + return PTR_ERR(inaccessible_mnt); + return 0; +} +fs_initcall(inaccessible_init); + +struct file *memfd_mkinaccessible(struct file *memfd) +{ + struct inaccessible_data *data; + struct address_space *mapping; + struct inode *inode; + struct file *file; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + data->memfd = memfd; + init_rwsem(&data->lock); + INIT_LIST_HEAD(&data->notifiers); + + inode = alloc_anon_inode(inaccessible_mnt->mnt_sb); + if (IS_ERR(inode)) { + kfree(data); + return ERR_CAST(inode); + } + + inode->i_mode |= S_IFREG; + inode->i_op = &inaccessible_iops; + inode->i_mapping->private_data = data; + + file = alloc_file_pseudo(inode, inaccessible_mnt, + "[memfd:inaccessible]", O_RDWR, + &inaccessible_fops); + if (IS_ERR(file)) { + iput(inode); + kfree(data); + } + + mapping = memfd->f_mapping; + mapping_set_unevictable(mapping); + mapping_set_gfp_mask(mapping, + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE); + + return file; +} + +int inaccessible_register_notifier(struct file *file, + struct inaccessible_notifier *notifier) +{ + struct inaccessible_data *data = file->f_mapping->private_data; + + down_write(&data->lock); + list_add(¬ifier->list, &data->notifiers); + up_write(&data->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(inaccessible_register_notifier); + +void inaccessible_unregister_notifier(struct file *file, + struct inaccessible_notifier *notifier) +{ + struct inaccessible_data *data = file->f_mapping->private_data; + + down_write(&data->lock); + list_del_rcu(¬ifier->list); + up_write(&data->lock); +} +EXPORT_SYMBOL_GPL(inaccessible_unregister_notifier); + +int inaccessible_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn, + int *order) +{ + struct inaccessible_data *data = file->f_mapping->private_data; + struct file *memfd = data->memfd; + struct page *page; + int ret; + + down_read(&data->lock); + + ret = shmem_getpage(file_inode(memfd), offset, &page, SGP_WRITE); + if (ret) { + up_read(&data->lock); + return ret; + } + + *pfn = page_to_pfn_t(page); + *order = thp_order(compound_head(page)); + return 0; +} +EXPORT_SYMBOL_GPL(inaccessible_get_pfn); + +void inaccessible_put_pfn(struct file *file, pfn_t pfn) +{ + struct page *page = pfn_t_to_page(pfn); + struct inaccessible_data *data = file->f_mapping->private_data; + + if (WARN_ON_ONCE(!page)) + return; + + SetPageUptodate(page); + unlock_page(page); + put_page(page); + up_read(&data->lock); +} +EXPORT_SYMBOL_GPL(inaccessible_put_pfn); -- Kiryl Shutsemau / Kirill A. Shutemov