From: Nadav Amit <namit@xxxxxxxxxx> In order to use userfaultfd with io-uring, there are two options for extensions: support userfaultfd ioctls or provide similar functionality through the "write" interface. The latter approach seems more compelling as it does not require io-uring changes, and keeps all the logic of userfaultfd where it should be. In addition, it allows asynchronous completions to be provided by performing the copying/zeroing in the faulting thread (which will be done in a later patch). This patch enhances the userfaultfd API to provide a write interface that performs similar operations for copy/zero. The low bits of the write position (those below PAGE_SHIFT) are used to encode the required operation: zero/copy/wake/write-protect. In the case of zeroing, the source data is ignored and only the length is used to determine the size of the data that needs to be zeroed. Cc: Jens Axboe <axboe@xxxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Peter Xu <peterx@xxxxxxxxxx> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> Cc: io-uring@xxxxxxxxxxxxxxx Cc: linux-fsdevel@xxxxxxxxxxxxxxx Cc: linux-kernel@xxxxxxxxxxxxxxx Cc: linux-mm@xxxxxxxxx Signed-off-by: Nadav Amit <namit@xxxxxxxxxx> --- fs/userfaultfd.c | 96 +++++++++++++++++++++++++++++++- include/uapi/linux/userfaultfd.h | 14 ++++- 2 files changed, 107 insertions(+), 3 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7bbee2a00d37..eae6ac303951 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1140,6 +1140,34 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) static const struct file_operations userfaultfd_fops; +/* Open-coded version of anon_inode_getfd() to setup FMODE_PWRITE */ +static int userfaultfd_getfd(const char *name, const struct file_operations *fops, + void *priv, int flags) +{ + int error, fd; + struct file *file; + + error = get_unused_fd_flags(flags); + if (error < 0) + return error; + fd = error; + + file = anon_inode_getfile(name, fops, 
priv, flags); + + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + file->f_mode |= FMODE_PWRITE; + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + return error; +} + static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, struct userfaultfd_ctx *new, struct uffd_msg *msg) @@ -1161,7 +1189,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, task_unlock(current); } - fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new, + fd = userfaultfd_getfd("[userfaultfd]", &userfaultfd_fops, new, O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS)); if (files != NULL) { @@ -1496,6 +1524,69 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } +ssize_t userfaultfd_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct userfaultfd_wake_range range; + struct userfaultfd_ctx *ctx = file->private_data; + size_t len = iov_iter_count(from); + __u64 dst = iocb->ki_pos & PAGE_MASK; + unsigned long mode = iocb->ki_pos & ~PAGE_MASK; + bool zeropage; + __s64 ret; + + BUG_ON(len == 0); + + zeropage = mode & UFFDIO_WRITE_MODE_ZEROPAGE; + + ret = -EINVAL; + if (mode & ~(UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP | + UFFDIO_WRITE_MODE_ZEROPAGE)) + goto out; + + mode = mode & (UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP); + + /* + * Keep compatibility with zeropage ioctl, which does not allow + * write-protect and dontwake. 
+ */ + if (zeropage && + (mode & (UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP)) == + (UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP)) + goto out; + + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + + ret = validate_range(ctx->mm, &dst, len); + if (ret) + goto out; + + if (mmget_not_zero(ctx->mm)) { + if (zeropage) + ret = mfill_zeropage(ctx->mm, dst, from, + &ctx->mmap_changing); + else + ret = mcopy_atomic(ctx->mm, dst, from, + &ctx->mmap_changing, mode); + mmput(ctx->mm); + } else { + return -ESRCH; + } + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + range.len = ret; + if (!(mode & UFFDIO_COPY_MODE_DONTWAKE)) { + range.start = dst; + wake_userfault(ctx, &range); + } +out: + return ret; +} + static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { @@ -2197,6 +2288,7 @@ static const struct file_operations userfaultfd_fops = { .release = userfaultfd_release, .poll = userfaultfd_poll, .read_iter = userfaultfd_read_iter, + .write_iter = userfaultfd_write_iter, .unlocked_ioctl = userfaultfd_ioctl, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, @@ -2248,7 +2340,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->files = get_files_struct(current); - fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, + fd = userfaultfd_getfd("[userfaultfd]", &userfaultfd_fops, ctx, O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); if (fd < 0) { mmdrop(ctx->mm); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 4eeba4235afe..943e50b41742 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -28,7 +28,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ - UFFD_FEATURE_POLL) + UFFD_FEATURE_POLL | \ + UFFD_FEATURE_WRITE) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ @@ -177,6 +178,9 @@ struct uffdio_api { * UFFD_FEATURE_POLL polls upon page-fault if the feature is requested * instead 
of descheduling. This feature should only be enabled for * low-latency handlers and when CPUs are not overcomitted. + * + * UFFD_FEATURE_WRITE allows the write interface to be used for copying and + * zeroing of pages in addition to the ioctl interface. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -188,6 +192,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_POLL (1<<9) +#define UFFD_FEATURE_WRITE (1<<10) __u64 features; __u64 ioctls; @@ -264,4 +269,11 @@ struct uffdio_writeprotect { __u64 mode; }; +/* + * Write modes, encoded in the low bits (below PAGE_SHIFT) of the write position. + */ +#define UFFDIO_WRITE_MODE_DONTWAKE UFFDIO_COPY_MODE_DONTWAKE +#define UFFDIO_WRITE_MODE_WP UFFDIO_COPY_MODE_WP +#define UFFDIO_WRITE_MODE_ZEROPAGE ((__u64)1<<2) + #endif /* _LINUX_USERFAULTFD_H */ -- 2.25.1