On Tue, Apr 02, 2024 at 02:18:22PM -0600, Jens Axboe wrote: > Rather than use the older style ->read() hook, use ->read_iter() so that > userfaultfd can support both O_NONBLOCK and IOCB_NOWAIT for non-blocking > read attempts. > > Split the fd setup into two parts, so that userfaultfd can mark the file > mode with FMODE_NOWAIT before installing it into the process table. With > that, we can also defer grabbing the mm until we know the rest will > succeed, as the fd isn't visible before then. > > Signed-off-by: Jens Axboe <axboe@xxxxxxxxx> > --- > fs/userfaultfd.c | 42 ++++++++++++++++++++++++++---------------- > 1 file changed, 26 insertions(+), 16 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index 60dcfafdc11a..7864c2dba858 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -282,7 +282,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, > /* > * Verify the pagetables are still not ok after having reigstered into > * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any > - * userfault that has already been resolved, if userfaultfd_read and > + * userfault that has already been resolved, if userfaultfd_read_iter and > * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different > * threads. > */ > @@ -1177,34 +1177,34 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, > return ret; > } > > -static ssize_t userfaultfd_read(struct file *file, char __user *buf, > - size_t count, loff_t *ppos) > +static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to) > { > + struct file *file = iocb->ki_filp; > struct userfaultfd_ctx *ctx = file->private_data; > ssize_t _ret, ret = 0; > struct uffd_msg msg; > - int no_wait = file->f_flags & O_NONBLOCK; > struct inode *inode = file_inode(file); > + bool no_wait; > > if (!userfaultfd_is_initialized(ctx)) > return -EINVAL; > > + no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; > for (;;) { > - if (count < sizeof(msg)) > + if (iov_iter_count(to) < sizeof(msg)) > return ret ? ret : -EINVAL; > _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); > if (_ret < 0) > return ret ? ret : _ret; > - if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) > + _ret = copy_to_iter(&msg, sizeof(msg), to); > + if (_ret < 0) > return ret ? ret : -EFAULT; > ret += sizeof(msg); > - buf += sizeof(msg); > - count -= sizeof(msg); > /* > * Allow to read more than one fault at time but only > * block if waiting for the very first one. > */ > - no_wait = O_NONBLOCK; > + no_wait = true; > } > } > > @@ -2172,7 +2172,7 @@ static const struct file_operations userfaultfd_fops = { > #endif > .release = userfaultfd_release, > .poll = userfaultfd_poll, > - .read = userfaultfd_read, > + .read_iter = userfaultfd_read_iter, > .unlocked_ioctl = userfaultfd_ioctl, > .compat_ioctl = compat_ptr_ioctl, > .llseek = noop_llseek, > @@ -2192,6 +2192,7 @@ static void init_once_userfaultfd_ctx(void *mem) > static int new_userfaultfd(int flags) > { > struct userfaultfd_ctx *ctx; > + struct file *file; > int fd; > > BUG_ON(!current->mm); > @@ -2215,16 +2216,25 @@ static int new_userfaultfd(int flags) > init_rwsem(&ctx->map_changing_lock); > atomic_set(&ctx->mmap_changing, 0); > ctx->mm = current->mm; > - /* prevent the mm struct to be freed */ > - mmgrab(ctx->mm); > + > + fd = get_unused_fd_flags(O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS)); > + if (fd < 0) > + goto err_out; > > /* Create a new inode so that the LSM can block the creation. */ > - fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx, > + file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, > O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); > - if (fd < 0) { > - mmdrop(ctx->mm); > - kmem_cache_free(userfaultfd_ctx_cachep, ctx); > + if (IS_ERR(file)) { > + fd = PTR_ERR(file); > + goto err_out; You're leaking the fd you allocated above. > } > + /* prevent the mm struct to be freed */ > + mmgrab(ctx->mm); > + file->f_mode |= FMODE_NOWAIT; > + fd_install(fd, file); > + return fd; > +err_out: > + kmem_cache_free(userfaultfd_ctx_cachep, ctx); > return fd; > } > > -- > 2.43.0 >