Add new READV_PI/WRITEV_PI operations to io_uring. Add new pi_addr and pi_len fields to the SQE struct. Add a new pi_iter field to struct kiocb and a new IOCB_USE_PI flag for ki_flags. Update the io_uring_req_failed trace event to match the new SQE layout. Signed-off-by: Alexander V. Buev <a.buev@xxxxxxxxx> --- fs/io_uring.c | 209 ++++++++++++++++++++++++++++++++ include/linux/fs.h | 2 + include/trace/events/io_uring.h | 17 +-- include/uapi/linux/io_uring.h | 6 +- include/uapi/linux/uio.h | 3 +- 5 files changed, 228 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2e04f718319d..6e941040f228 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -563,6 +563,19 @@ struct io_rw { u64 len; }; +struct io_rw_pi_state { + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV_PI]; +}; + +struct io_rw_pi { + struct io_rw rw; + struct iovec *pi_iov; + u32 nr_pi_segs; + struct io_rw_pi_state *s; +}; + struct io_connect { struct file *file; struct sockaddr __user *addr; @@ -716,6 +729,12 @@ struct io_async_rw { struct wait_page_queue wpq; }; +struct io_async_rw_pi { + struct io_async_rw async; + const struct iovec *free_iovec; + struct io_rw_pi_state s; +}; + enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -744,6 +763,7 @@ enum { /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, + REQ_F_USE_PI_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -799,6 +819,8 @@ enum { REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + /* pi metadata present */ + REQ_F_USE_PI = BIT(REQ_F_USE_PI_BIT) }; struct async_poll { @@ -855,6 +877,7 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; + struct io_rw_pi rw_pi; }; u8 opcode; @@ -1105,6 +1128,24 @@ static const struct 
io_op_def io_op_defs[] = { [IORING_OP_MKDIRAT] = {}, [IORING_OP_SYMLINKAT] = {}, [IORING_OP_LINKAT] = {}, + [IORING_OP_READV_PI] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .needs_async_setup = 1, + .plug = 1, + .async_size = sizeof(struct io_async_rw_pi), + }, + [IORING_OP_WRITEV_PI] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .plug = 1, + .async_size = sizeof(struct io_async_rw_pi), + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -3053,6 +3094,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (!(req->file->f_flags & O_DIRECT)) + return -EINVAL; + + req->rw.kiocb.ki_flags |= IOCB_USE_PI; + req->flags |= REQ_F_USE_PI; + req->rw_pi.pi_iov = u64_to_user_ptr(READ_ONCE(sqe->pi_addr)); + req->rw_pi.nr_pi_segs = READ_ONCE(sqe->pi_len); + return 0; +} + static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) { switch (ret) { @@ -3505,10 +3558,39 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, iorw = req->async_data; /* we've copied and mapped the iter, ensure state is saved */ iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); + if (req->flags & REQ_F_USE_PI) { + struct io_async_rw_pi *iorw_pi = req->async_data; + + /* copy iter from req to async ctx */ + iorw_pi->s.iter = req->rw_pi.s->iter; + + if (req->rw_pi.s->iter.iov == req->rw_pi.s->fast_iov) { + memcpy(iorw_pi->s.fast_iov, req->rw_pi.s->fast_iov, + sizeof(iorw_pi->s.fast_iov)); + iorw_pi->s.iter.iov = iorw_pi->s.fast_iov; + iorw_pi->free_iovec = 0; + } else { + req->flags |= REQ_F_NEED_CLEANUP; + iorw_pi->free_iovec = req->rw_pi.s->iter.iov; + } + + iov_iter_save_state(&iorw_pi->s.iter, &iorw_pi->s.iter_state); + } } return 0; } +static inline +int io_import_pi_iovec(struct io_kiocb *req, 
int rw, unsigned int fast_segs, + struct iovec **fast_iov, struct iov_iter *iter) +{ + void __user *buf = req->rw_pi.pi_iov; + + return __import_iovec(rw, buf, req->rw_pi.nr_pi_segs, fast_segs, + fast_iov, iter, req->ctx->compat); +} + + static inline int io_rw_prep_async(struct io_kiocb *req, int rw) { struct io_async_rw *iorw = req->async_data; @@ -3527,6 +3609,25 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) return 0; } +static inline int io_rw_prep_async_pi(struct io_kiocb *req, int rw) +{ + int ret = 0; + struct io_async_rw_pi *iorw_pi = req->async_data; + struct iovec *pi_iov = iorw_pi->s.fast_iov; + + ret = io_import_pi_iovec(req, rw, UIO_FASTIOV_PI, &pi_iov, &iorw_pi->s.iter); + if (unlikely(ret < 0)) + return ret; + + iorw_pi->free_iovec = pi_iov; + + if (pi_iov) + req->flags |= REQ_F_NEED_CLEANUP; + iov_iter_save_state(&iorw_pi->s.iter, &iorw_pi->s.iter_state); + + return io_rw_prep_async(req, rw); +} + static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_READ))) @@ -3534,6 +3635,15 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_prep_rw(req, sqe); } +static int io_read_prep_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret = io_read_prep(req, sqe); + + if (ret) + return ret; + return io_prep_rw_pi(req, sqe); +} + /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -3690,6 +3800,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * manually if we need to. 
*/ iov_iter_restore(&s->iter, &s->iter_state); + if (req->flags & REQ_F_USE_PI) + iov_iter_restore(kiocb->pi_iter, &req->rw_pi.s->iter_state); + ret2 = io_setup_async_rw(req, iovec, s, true); if (ret2) @@ -3714,6 +3827,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) break; rw->bytes_done += ret; iov_iter_save_state(&s->iter, &s->iter_state); + if (req->flags & REQ_F_USE_PI) + iov_iter_save_state(kiocb->pi_iter, &req->rw_pi.s->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -3733,6 +3848,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; iov_iter_restore(&s->iter, &s->iter_state); + if (req->flags & REQ_F_USE_PI) + iov_iter_restore(kiocb->pi_iter, &req->rw_pi.s->iter_state); } while (ret > 0); done: kiocb_done(req, ret, issue_flags); @@ -3743,6 +3860,34 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_read_pi(struct io_kiocb *req, unsigned int issue_flags) +{ + if (req_has_async_data(req)) { + struct io_async_rw_pi *iorw_pi = req->async_data; + + iov_iter_restore(&iorw_pi->s.iter, &iorw_pi->s.iter_state); + req->rw.kiocb.pi_iter = &iorw_pi->s.iter; + req->rw_pi.s = &iorw_pi->s; + return io_read(req, issue_flags); + } else { + int ret; + struct io_rw_pi_state __s, *s = &__s; + struct iovec *pi_iov = __s.fast_iov; + + ret = io_import_pi_iovec(req, READ, UIO_FASTIOV_PI, &pi_iov, &s->iter); + if (unlikely(ret < 0)) + return ret; + iov_iter_save_state(&s->iter, &s->iter_state); + req->rw.kiocb.pi_iter = &s->iter; + req->rw_pi.s = s; + + ret = io_read(req, issue_flags); + if (pi_iov && !(ret == -EAGAIN && req_has_async_data(req))) + kfree(pi_iov); + return ret; + } +} + static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_WRITE))) @@ -3751,6 +3896,15 @@ static int io_write_prep(struct 
io_kiocb *req, const struct io_uring_sqe *sqe) return io_prep_rw(req, sqe); } +static int io_write_prep_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret = io_write_prep(req, sqe); + + if (ret) + return ret; + return io_prep_rw_pi(req, sqe); +} + static int io_write(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw_state __s, *s = &__s; @@ -3836,6 +3990,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); + + if (req->flags & REQ_F_USE_PI) + iov_iter_restore(kiocb->pi_iter, &req->rw_pi.s->iter_state); + ret = io_setup_async_rw(req, iovec, s, false); return ret ?: -EAGAIN; } @@ -3846,6 +4004,34 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } +static int io_write_pi(struct io_kiocb *req, unsigned int issue_flags) +{ + if (req_has_async_data(req)) { + struct io_async_rw_pi *iorw_pi = req->async_data; + + req->rw.kiocb.pi_iter = &iorw_pi->s.iter; + req->rw_pi.s = &iorw_pi->s; + iov_iter_restore(&iorw_pi->s.iter, &iorw_pi->s.iter_state); + return io_write(req, issue_flags); + } else { + int ret; + struct io_rw_pi_state __s, *s = &__s; + struct iovec *pi_iov = __s.fast_iov; + + ret = io_import_pi_iovec(req, WRITE, UIO_FASTIOV_PI, &pi_iov, &s->iter); + if (unlikely(ret < 0)) + return ret; + iov_iter_save_state(&s->iter, &s->iter_state); + req->rw.kiocb.pi_iter = &s->iter; + req->rw_pi.s = s; + + ret = io_write(req, issue_flags); + if (pi_iov && !(ret == -EAGAIN && req_has_async_data(req))) + kfree(pi_iov); + return ret; + } +} + static int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -6500,10 +6686,14 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_READ_FIXED: case IORING_OP_READ: return io_read_prep(req, sqe); + case IORING_OP_READV_PI: + return io_read_prep_pi(req, sqe); case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: 
return io_write_prep(req, sqe); + case IORING_OP_WRITEV_PI: + return io_write_prep_pi(req, sqe); case IORING_OP_POLL_ADD: return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: @@ -6589,6 +6779,10 @@ static int io_req_prep_async(struct io_kiocb *req) return io_rw_prep_async(req, READ); case IORING_OP_WRITEV: return io_rw_prep_async(req, WRITE); + case IORING_OP_READV_PI: + return io_rw_prep_async_pi(req, READ); + case IORING_OP_WRITEV_PI: + return io_rw_prep_async_pi(req, WRITE); case IORING_OP_SENDMSG: return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: @@ -6670,7 +6864,14 @@ static void io_clean_op(struct io_kiocb *req) case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: { struct io_async_rw *io = req->async_data; + kfree(io->free_iovec); + break; + } + case IORING_OP_READV_PI: + case IORING_OP_WRITEV_PI: { + struct io_async_rw_pi *io = req->async_data; + kfree(io->async.free_iovec); kfree(io->free_iovec); break; } @@ -6750,11 +6951,17 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_READ: ret = io_read(req, issue_flags); break; + case IORING_OP_READV_PI: + ret = io_read_pi(req, issue_flags); + break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: ret = io_write(req, issue_flags); break; + case IORING_OP_WRITEV_PI: + ret = io_write_pi(req, issue_flags); + break; case IORING_OP_FSYNC: ret = io_fsync(req, issue_flags); break; @@ -11218,6 +11425,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); + BUILD_BUG_SQE_ELEM(48, __u64, pi_addr); + BUILD_BUG_SQE_ELEM(56, __u32, pi_len); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..c45ec5073300 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -316,6 +316,7 @@ enum rw_hint { #define IOCB_NOIO (1 << 
20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) +#define IOCB_USE_PI (1 << 22) struct kiocb { struct file *ki_filp; @@ -330,6 +331,7 @@ struct kiocb { u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ struct wait_page_queue *ki_waitq; /* for async buffered IO */ + struct iov_iter *pi_iter; randomized_struct_fields_end }; diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 7346f0164cf4..8a435df796b9 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -524,8 +524,9 @@ TRACE_EVENT(io_uring_req_failed, __field( u16, buf_index ) __field( u16, personality ) __field( u32, file_index ) - __field( u64, pad1 ) - __field( u64, pad2 ) + __field(u64, pi_addr) + __field(u32, pi_len) + __field(u32, pad) __field( int, error ) ), @@ -541,21 +542,23 @@ TRACE_EVENT(io_uring_req_failed, __entry->buf_index = sqe->buf_index; __entry->personality = sqe->personality; __entry->file_index = sqe->file_index; - __entry->pad1 = sqe->__pad2[0]; - __entry->pad2 = sqe->__pad2[1]; + __entry->pi_addr = sqe->pi_addr; + __entry->pi_len = sqe->pi_len; + __entry->pad = sqe->__pad2; __entry->error = error; ), TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, " - "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", + "personality=%d, file_index=%d, pi_addr=0x%llx, pi_len=%u, " + "pad=%u, error=%d", __entry->opcode, __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, __entry->op_flags, (unsigned long long) __entry->user_data, __entry->buf_index, __entry->personality, __entry->file_index, - (unsigned long long) __entry->pad1, - (unsigned long long) __entry->pad2, __entry->error) + (unsigned long long) __entry->pi_addr, __entry->pi_len, + __entry->pad, __entry->error) ); #endif /* _TRACE_IO_URING_H */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 
787f491f0d2a..87ea512c2c8d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -60,7 +60,9 @@ struct io_uring_sqe { __s32 splice_fd_in; __u32 file_index; }; - __u64 __pad2[2]; + __u64 pi_addr; /* pointer to iovec */ + __u32 pi_len; + __u32 __pad2; }; enum { @@ -143,6 +145,8 @@ enum { IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, IORING_OP_LINKAT, + IORING_OP_READV_PI, + IORING_OP_WRITEV_PI, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h index 059b1a9147f4..c9eaaa6cdb0f 100644 --- a/include/uapi/linux/uio.h +++ b/include/uapi/linux/uio.h @@ -23,9 +23,10 @@ struct iovec /* * UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1) */ - + #define UIO_FASTIOV 8 #define UIO_MAXIOV 1024 +#define UIO_FASTIOV_PI 1 #endif /* _UAPI__LINUX_UIO_H */ -- 2.34.1