Add a new io_uring opcode IORING_OP_SENDZC. The main distinction from IORING_OP_SEND is that the user should specify a notification slot index in sqe::notification_idx and the buffers are safe to reuse only when the used notification is flushed and completes. Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx> --- include/uapi/linux/io_uring.h | 5 +++ io_uring/net.c | 84 +++++++++++++++++++++++++++++++++++ io_uring/net.h | 4 ++ io_uring/opdef.c | 15 +++++++ 4 files changed, 108 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 9b7ea3e1018f..0e1e179cec1d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -62,6 +62,10 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + struct { + __u16 notification_idx; + __u16 __pad; + } __attribute__((packed)); }; union { struct { @@ -193,6 +197,7 @@ enum io_uring_op { IORING_OP_GETXATTR, IORING_OP_SOCKET, IORING_OP_URING_CMD, + IORING_OP_SENDZC, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/net.c b/io_uring/net.c index d95c88d83f9f..ef492f1360c8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -13,6 +13,7 @@ #include "io_uring.h" #include "kbuf.h" #include "net.h" +#include "notif.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -58,6 +59,14 @@ struct io_sr_msg { unsigned int flags; }; +struct io_sendzc { + struct file *file; + void __user *buf; + size_t len; + u16 slot_idx; + int msg_flags; +}; + #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -770,4 +779,79 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sendzc *zc = io_kiocb_to_cmd(req); + + if (READ_ONCE(sqe->ioprio) || READ_ONCE(sqe->addr2) || READ_ONCE(sqe->__pad2[0])) + return -EINVAL; + + zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + zc->len = READ_ONCE(sqe->len); + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->slot_idx = READ_ONCE(sqe->notification_idx); + if (zc->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + zc->msg_flags |= MSG_CMSG_COMPAT; +#endif + return 0; +} + +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_notif_slot *notif_slot; + struct io_notif *notif; + struct msghdr msg; + struct iovec iov; + struct socket *sock; + unsigned msg_flags; + int ret, min_ret = 0; + + if (issue_flags & IO_URING_F_UNLOCKED) + return -EAGAIN; + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + notif_slot = io_get_notif_slot(ctx, zc->slot_idx); + if (!notif_slot) + return -EINVAL; + notif = io_get_notif(ctx, notif_slot); + if (!notif) + return -ENOMEM; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_managed_data = 0; + + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; + + msg_flags = zc->msg_flags | MSG_ZEROCOPY; + if (issue_flags & IO_URING_F_NONBLOCK) + msg_flags |= MSG_DONTWAIT; + if (msg_flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + msg.msg_flags = msg_flags; + msg.msg_ubuf = ¬if->uarg; + ret = sock_sendmsg(sock, &msg); + + if (unlikely(ret < min_ret)) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + return ret == -ERESTARTSYS ? -EINTR : ret; + } + + io_req_set_res(req, ret, 0); + return IOU_OK; +} #endif diff --git a/io_uring/net.h b/io_uring/net.h index 81d71d164770..1dba8befebb3 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -40,4 +40,8 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags); int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); + +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + #endif diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 0be00db9e31c..91d425b43174 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -466,6 +466,21 @@ const struct io_op_def io_op_defs[] = { .issue = io_uring_cmd, .prep_async = io_uring_cmd_prep_async, }, + [IORING_OP_SENDZC] = { + .name = "SENDZC", + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_sendzc_prep, + .issue = io_sendzc, +#else + .prep = io_eopnotsupp_prep, +#endif + + }, }; const char *io_uring_get_opcode(u8 opcode) -- 2.36.1