From: Anuj Gupta <anuj20.g@xxxxxxxxxxx> This patch introduces IORING_OP_READ_META and IORING_OP_WRITE_META opcodes which allow sending a meta buffer along with read/write. An application can do that by using the newly added meta_addr and meta_len fields of the SQE. These opcodes are supported only for direct IO. Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx> Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx> Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx> --- include/linux/fs.h | 1 + include/uapi/linux/io_uring.h | 6 +++ io_uring/io_uring.c | 2 + io_uring/opdef.c | 29 ++++++++++++ io_uring/rw.c | 86 +++++++++++++++++++++++++++++++++-- io_uring/rw.h | 8 ++++ 6 files changed, 129 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 0a22b7245982..c3a483a4fdac 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -327,6 +327,7 @@ struct readahead_control; #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) +#define IOCB_USE_META (1 << 22) /* * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the * iocb completion can be passed back to the owner for execution from a safe diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7bd10201a02b..87bd44098037 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -97,6 +97,10 @@ struct io_uring_sqe { __u64 addr3; __u64 __pad2[1]; }; + struct { + __u64 meta_addr; + __u32 meta_len; + }; __u64 optval; /* * If the ring is initialized with IORING_SETUP_SQE128, then @@ -256,6 +260,8 @@ enum io_uring_op { IORING_OP_FUTEX_WAITV, IORING_OP_FIXED_FD_INSTALL, IORING_OP_FTRUNCATE, + IORING_OP_READ_META, + IORING_OP_WRITE_META, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 49a124daa359..7c380cac4465 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4134,7 +4134,9 @@ static int __init io_uring_init(void) 
BUILD_BUG_SQE_ELEM(44, __u16, addr_len); BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); + BUILD_BUG_SQE_ELEM(48, __u64, meta_addr); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); + BUILD_BUG_SQE_ELEM(56, __u32, meta_len); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9c080aadc5a6..cb31573ac4ad 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -146,6 +146,26 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_READ_META] = { + .needs_file = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .iopoll_queue = 1, + .prep = io_prep_rw_meta, + .issue = io_rw_meta, + }, + [IORING_OP_WRITE_META] = { + .needs_file = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .iopoll_queue = 1, + .prep = io_prep_rw_meta, + .issue = io_rw_meta, + }, [IORING_OP_RECVMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -501,6 +521,15 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, + [IORING_OP_READ_META] = { + .async_size = sizeof(struct io_async_rw), + .name = "READ_META", + .fail = io_rw_fail, + }, + [IORING_OP_WRITE_META] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITE_META", + }, [IORING_OP_FSYNC] = { .name = "FSYNC", }, diff --git a/io_uring/rw.c b/io_uring/rw.c index 40f6c2a59928..87a6304052f0 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -27,6 +27,7 @@ struct io_rw { struct kiocb kiocb; u64 addr; u32 len; + u32 meta_len; }; static inline bool io_file_supports_nowait(struct io_kiocb *req) @@ -107,6 +108,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct kiocb *kiocb = &rw->kiocb; + int ret; + + ret = io_prep_rw(req, sqe); 
+ if (unlikely(ret)) + return ret; + kiocb->private = u64_to_user_ptr(READ_ONCE(sqe->meta_addr)); + rw->meta_len = READ_ONCE(sqe->meta_len); + + kiocb->ki_flags |= IOCB_USE_META; + return 0; +} + int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; @@ -571,9 +588,18 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, } } +static inline void io_req_map_meta(struct io_async_rw *iorw, struct io_rw_state_meta *sm) +{ + memcpy(&iorw->s_meta.iter_meta, &sm->iter_meta, sizeof(struct iov_iter)); + iov_iter_save_state(&iorw->s_meta.iter_meta, &iorw->s_meta.iter_state_meta); +} + static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_rw_state *s, bool force) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct kiocb *kiocb = &rw->kiocb; + if (!force && !io_cold_defs[req->opcode].prep_async) return 0; /* opcode type doesn't need async data */ @@ -591,6 +617,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, iorw = req->async_data; /* we've copied and mapped the iter, ensure state is saved */ iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); + if (unlikely(kiocb->ki_flags & IOCB_USE_META)) { + struct io_rw_state_meta *sm = kiocb->private; + + io_req_map_meta(iorw, sm); + } } return 0; } @@ -747,7 +778,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; - kiocb->private = NULL; + if (likely(!(kiocb->ki_flags & IOCB_USE_META))) + kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; @@ -766,6 +798,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) struct io_rw_state __s, *s = &__s; struct iovec *iovec; struct kiocb *kiocb = &rw->kiocb; + struct io_rw_state_meta *sm = kiocb->private; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct 
io_async_rw *io; ssize_t ret, ret2; @@ -840,13 +873,16 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) goto done; + if (kiocb->ki_flags & IOCB_USE_META) + kiocb->private = sm; ret = 0; } else if (ret == -EIOCBQUEUED) { if (iovec) kfree(iovec); return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { + (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || + (kiocb->ki_flags & IOCB_USE_META)) { /* read all, failed, already did sync or don't want to retry */ goto done; } @@ -857,6 +893,12 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * manually if we need to. */ iov_iter_restore(&s->iter, &s->iter_state); + if (unlikely(kiocb->ki_flags & IOCB_USE_META)) { + /* don't handle partial completion for read + meta */ + if (ret > 0) + goto done; + iov_iter_restore(&sm->iter_meta, &sm->iter_state_meta); + } ret2 = io_setup_async_rw(req, iovec, s, true); iovec = NULL; @@ -1070,7 +1112,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; - if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { + if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req) + && !(kiocb->ki_flags & IOCB_USE_META)) { struct io_async_rw *io; trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, @@ -1111,6 +1154,43 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } +int io_rw_meta(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + void __user *meta_addr = u64_to_user_ptr((u64)rw->kiocb.private); + struct io_rw_state_meta __sm, *sm = &__sm; + struct kiocb *kiocb = &rw->kiocb; + int ret; + + if (!(req->file->f_flags & O_DIRECT)) + return -EOPNOTSUPP; + /* prepare iter for meta-buffer */ + 
if (!req_has_async_data(req)) { + ret = import_ubuf(ITER_SOURCE, meta_addr, rw->meta_len, &sm->iter_meta); + iov_iter_save_state(&sm->iter_meta, &sm->iter_state_meta); + if (unlikely(ret < 0)) + return ret; + } else { + struct io_async_rw *io = req->async_data; + + sm = &io->s_meta; + iov_iter_restore(&sm->iter_meta, &sm->iter_state_meta); + } + /* Store iter for meta-buf in private, will be used later*/ + kiocb->private = sm; + if (req->opcode == IORING_OP_READ_META) { + ret = __io_read(req, issue_flags); + if (ret >= 0) + return kiocb_done(req, ret, issue_flags); + } else { + ret = io_write(req, issue_flags); + } + if (ret == -EAGAIN) + kiocb->private = meta_addr; + return ret; + +} + void io_rw_fail(struct io_kiocb *req) { int res; diff --git a/io_uring/rw.h b/io_uring/rw.h index f9e89b4fe4da..7c12216776bc 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -8,19 +8,27 @@ struct io_rw_state { struct iovec fast_iov[UIO_FASTIOV]; }; +struct io_rw_state_meta { + struct iov_iter iter_meta; + struct iov_iter_state iter_state_meta; +}; + struct io_async_rw { struct io_rw_state s; + struct io_rw_state_meta s_meta; const struct iovec *free_iovec; size_t bytes_done; struct wait_page_queue wpq; }; int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_readv_prep_async(struct io_kiocb *req); int io_write(struct io_kiocb *req, unsigned int issue_flags); +int io_rw_meta(struct io_kiocb *req, unsigned int issue_flags); int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); -- 2.25.1