On 05/01/2021 23:00, arni@xxxxxxxx wrote:
> From: Árni Dagur <arni@xxxxxxxx>
>
> * The `sqe->splice_flags` field is used to hold flags.
> * We return -EAGAIN if force_nonblock is set.
>
> Signed-off-by: Árni Dagur <arni@xxxxxxxx>
> ---
>  fs/io_uring.c                 | 76 +++++++++++++++++++++++++++++++++++
>  include/uapi/linux/io_uring.h |  1 +
>  2 files changed, 77 insertions(+)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index ca46f314640b..a99a89798386 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -531,6 +531,13 @@ struct io_splice {
>  	unsigned int			flags;
>  };
>
> +struct io_vmsplice {
> +	struct file			*file;
> +	u64				addr;
> +	u64				len;
> +	unsigned int			flags;
> +};
> +
>  struct io_provide_buf {
>  	struct file			*file;
>  	__u64				addr;
> @@ -692,6 +699,7 @@ struct io_kiocb {
>  		struct io_madvise	madvise;
>  		struct io_epoll		epoll;
>  		struct io_splice	splice;
> +		struct io_vmsplice	vmsplice;
>  		struct io_provide_buf	pbuf;
>  		struct io_statx		statx;
>  		struct io_shutdown	shutdown;
> @@ -967,6 +975,12 @@ static const struct io_op_def io_op_defs[] = {
>  		.unbound_nonreg_file	= 1,
>  		.work_flags		= IO_WQ_WORK_BLKCG,
>  	},
> +	[IORING_OP_VMSPLICE] = {
> +		.needs_file		= 1,
> +		.hash_reg_file		= 1,
> +		.unbound_nonreg_file	= 1,
> +		.work_flags		= IO_WQ_WORK_MM,
> +	},
>  	[IORING_OP_PROVIDE_BUFFERS] = {},
>  	[IORING_OP_REMOVE_BUFFERS] = {},
>  	[IORING_OP_TEE] = {
> @@ -3884,6 +3898,63 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
>  	return 0;
>  }
>
> +static int io_vmsplice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +	struct io_vmsplice *sp = &req->vmsplice;
> +
> +	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> +		return -EINVAL;
> +	if (unlikely(READ_ONCE(sqe->off)))
> +		return -EINVAL;
> +
> +	sp->addr = READ_ONCE(sqe->addr);
> +	sp->len = READ_ONCE(sqe->len);
> +	sp->flags = READ_ONCE(sqe->splice_flags);
> +
> +	if (sp->flags & ~SPLICE_F_ALL)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static int io_vmsplice(struct io_kiocb *req, bool force_nonblock)
> +{
> +	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
> +	struct io_vmsplice *sp = &req->vmsplice;
> +	void __user *buf = u64_to_user_ptr(sp->addr);

This points at an iovec array, not a plain buffer, so type it as
const struct iovec __user *uiov.

> +	struct iov_iter __iter, *iter = &__iter;

read/write either use ((struct io_async_rw *)req->async_data)->iter or,
to avoid allocation, an on-stack iter. This only ever has the on-stack
__iter, so why do you need *iter?

> +	struct file *file = sp->file;
> +	ssize_t io_size;
> +	int type, ret;
> +
> +	if (force_nonblock)
> +		return -EAGAIN;
> +
> +	if (file->f_mode & FMODE_WRITE)
> +		type = WRITE;
> +	else if (file->f_mode & FMODE_READ)
> +		type = READ;
> +	else {
> +		ret = -EBADF;
> +		goto err;

It jumps to kfree(iovec) while iovec == inline_vecs, i.e. kfree() of
on-stack memory.

> +	}
> +
> +	ret = __import_iovec(type, buf, sp->len, UIO_FASTIOV, &iovec, iter,
> +			     req->ctx->compat);

This may happen asynchronously long after io_uring_enter(submit)
returned, e.g. if a user keeps uiov on-stack it will fail or read
garbage. So, it's either to make it a part of the ABI -- users must not
delete uiov until the request completes -- or copy it while not-yet-async.
For consistency with read/write I'd prefer the second; see the sketch
below the hunk.

> +	if (ret < 0)
> +		goto err;
> +	io_size = iov_iter_count(iter);
> +
> +	ret = do_vmsplice(file, iter, sp->flags);
> +	if (ret != io_size) {
> +err:
> +		req_set_fail_links(req);
> +	}
> +	io_req_complete(req, ret);
> +	kfree(iovec);
> +	return 0;
> +}
> +
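To sketch what I mean by the second option -- completely untested, and
io_async_vmsplice / free_iovec below are names I just made up, nothing
in the tree -- mirror io_async_rw: give the io_op_defs entry
.needs_async_data = 1 and .async_size = sizeof(struct io_async_vmsplice),
and do the import in prep, while we're still in the submitter's context:

struct io_async_vmsplice {
	/* same layout idea as io_async_rw */
	struct iovec		fast_iov[UIO_FASTIOV];
	const struct iovec	*free_iovec;
	struct iov_iter		iter;
};

static int io_vmsplice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_vmsplice *sp = &req->vmsplice;
	struct io_async_vmsplice *av;
	const struct iovec __user *uiov;
	struct iovec *iov;
	int type;
	ssize_t ret;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(READ_ONCE(sqe->off)))
		return -EINVAL;

	sp->flags = READ_ONCE(sqe->splice_flags);
	if (sp->flags & ~SPLICE_F_ALL)
		return -EINVAL;

	/* direction is needed for the import; req->file is already
	 * assigned at this point for needs_file ops */
	if (req->file->f_mode & FMODE_WRITE)
		type = WRITE;
	else if (req->file->f_mode & FMODE_READ)
		type = READ;
	else
		return -EBADF;

	if (!req->async_data && io_alloc_async_data(req))
		return -ENOMEM;
	av = req->async_data;
	iov = av->fast_iov;

	uiov = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sp->len = READ_ONCE(sqe->len);

	/*
	 * Copies the iovec array into kernel memory while we're still in
	 * the submitter's context, so uiov may be reused or freed as soon
	 * as submission returns, same as for readv/writev.
	 */
	ret = __import_iovec(type, uiov, sp->len, UIO_FASTIOV, &iov,
			     &av->iter, req->ctx->compat);
	if (ret < 0)
		return ret;
	/* NULL unless __import_iovec() had to allocate, so kfree() is safe */
	av->free_iovec = iov;
	return 0;
}

io_vmsplice() then shrinks to do_vmsplice(file, &av->iter, sp->flags)
plus a kfree(av->free_iovec) at completion. do_vmsplice() picks the
direction from iov_iter_rw(), and as a bonus this gets rid of both the
*iter indirection and the kfree() of inline_vecs mentioned above.
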
>  /*
>   * IORING_OP_NOP just posts a completion event, nothing else.
>   */
> @@ -6009,6 +6080,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  		return io_epoll_ctl_prep(req, sqe);
>  	case IORING_OP_SPLICE:
>  		return io_splice_prep(req, sqe);
> +	case IORING_OP_VMSPLICE:
> +		return io_vmsplice_prep(req, sqe);
>  	case IORING_OP_PROVIDE_BUFFERS:
>  		return io_provide_buffers_prep(req, sqe);
>  	case IORING_OP_REMOVE_BUFFERS:
> @@ -6262,6 +6335,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
>  	case IORING_OP_SPLICE:
>  		ret = io_splice(req, force_nonblock);
>  		break;
> +	case IORING_OP_VMSPLICE:
> +		ret = io_vmsplice(req, force_nonblock);
> +		break;
>  	case IORING_OP_PROVIDE_BUFFERS:
>  		ret = io_provide_buffers(req, force_nonblock, cs);
>  		break;
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index d31a2a1e8ef9..6bc79f9bb123 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -137,6 +137,7 @@ enum {
>  	IORING_OP_SHUTDOWN,
>  	IORING_OP_RENAMEAT,
>  	IORING_OP_UNLINKAT,
> +	IORING_OP_VMSPLICE,
>
>  	/* this goes last, obviously */
>  	IORING_OP_LAST,

-- 
Pavel Begunkov