Add the ability to pass integrity metadata along with read/write. A new
ext_cap (extended capability) field is introduced in the SQE, indicating
the type of extra information being sent. A new 'struct io_uring_sqe_ext'
represents the secondary SQE space for read/write. If another extension
needs to be added in the future, one needs to:
1. Add extra fields in the SQE/secondary SQE
2. Introduce an ext_cap flag indicating the additional values that have
   been passed

The last 32 bytes of the secondary SQE are used to pass the following
PI-related information:

- flags: integrity check flags, namely IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- len: length of the PI/metadata buffer
- addr: address of the metadata buffer
- seed: seed value for reftag remapping
- app_tag: application-defined 16-bit value

The application sets up an SQE128 ring, prepares the PI information
within the second SQE, and sets the ext_cap field to EXT_CAP_PI. This
information is used to prepare a uio_meta descriptor, which is passed
down via kiocb->private.

Meta exchange is supported only for direct IO. Vectored read/write
operations with metadata are currently not supported.

Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx>
---
 include/uapi/linux/io_uring.h | 34 ++++++++++++++
 io_uring/io_uring.c           |  8 ++++
 io_uring/rw.c                 | 88 ++++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 | 14 +++++-
 4 files changed, 141 insertions(+), 3 deletions(-)
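
For illustration, here is a minimal userspace sketch (not part of this
patch, helper name is hypothetical) of issuing a read with PI through
this interface. It assumes liburing built against the patched uapi
header; the IO_INTEGRITY_CHK_* flags come from the companion patches in
this series (assumed here to live in <linux/fs.h>). The ring must be
created with IORING_SETUP_SQE128 and the file opened with O_DIRECT:

#include <stdint.h>
#include <string.h>
#include <liburing.h>
#include <linux/fs.h>	/* assumed home of IO_INTEGRITY_CHK_* */

/* hypothetical helper: queue a read at offset 0 with PI attached */
static int read_with_pi(struct io_uring *ring, int fd, void *buf,
			unsigned int len, void *pi_buf, unsigned int pi_len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_sqe_ext *sqe_ext;

	if (!sqe)
		return -1;
	io_uring_prep_read(sqe, fd, buf, len, 0);
	sqe->ext_cap = EXT_CAP_PI;

	/* PI info lives in the last 32 bytes of the secondary SQE */
	sqe_ext = (struct io_uring_sqe_ext *)(sqe + 1);
	memset(sqe_ext, 0, sizeof(*sqe_ext));	/* reserved fields must be zero */
	sqe_ext->rw_pi.flags = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG;
	sqe_ext->rw_pi.len = pi_len;
	sqe_ext->rw_pi.addr = (__u64)(uintptr_t)pi_buf;
	sqe_ext->rw_pi.seed = 0;	/* initial reftag for remapping */
	sqe_ext->rw_pi.app_tag = 0;

	return io_uring_submit(ring);
}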
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 56cf30b49ef5..449e7627b1b5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,6 +92,11 @@ struct io_uring_sqe {
 			__u16	addr_len;
 			__u16	__pad3[1];
 		};
+		struct {
+			/* flags indicating additional information being passed */
+			__u16	ext_cap;
+			__u16	__pad4[1];
+		};
 	};
 	union {
 		struct {
@@ -107,6 +112,35 @@ struct io_uring_sqe {
 	};
 };
 
+enum io_uring_sqe_ext_cap_bits {
+	EXT_CAP_PI_BIT,
+	/*
+	 * not a real extended capability; just to make sure that we don't
+	 * overflow
+	 */
+	EXT_CAP_LAST_BIT,
+};
+
+/* extended capability flags */
+#define EXT_CAP_PI	(1U << EXT_CAP_PI_BIT)
+
+/* Second half of SQE128 for IORING_OP_READ/WRITE */
+struct io_uring_sqe_ext {
+	__u64	rsvd0[4];
+	/* if sqe->ext_cap is EXT_CAP_PI, last 32 bytes are for PI */
+	union {
+		__u64	rsvd1[4];
+		struct {
+			__u16	flags;
+			__u16	app_tag;
+			__u32	len;
+			__u64	addr;
+			__u64	seed;
+			__u64	rsvd;
+		} rw_pi;
+	};
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b590e50f09e7..6e582fe93bc4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4165,7 +4165,9 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
 	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
+	BUILD_BUG_SQE_ELEM(44, __u16,  ext_cap);
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
+	BUILD_BUG_SQE_ELEM(46, __u16,  __pad4[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
@@ -4192,6 +4194,12 @@ static int __init io_uring_init(void)
 	/* top 8bits are for internal use */
 	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
 
+	BUILD_BUG_ON(sizeof(struct io_uring_sqe_ext) !=
+		     sizeof(struct io_uring_sqe));
+
+	BUILD_BUG_ON(EXT_CAP_LAST_BIT >
+		     8 * sizeof_field(struct io_uring_sqe, ext_cap));
+
 	io_uring_optable_init();
 
 	/*
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 768a908ca2a8..e60bf0ed4c4f 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,64 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	return 0;
 }
 
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+	io->meta_state.seed = io->meta.seed;
+	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io)
+{
+	io->meta.seed = io->meta_state.seed;
+	iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline const void *io_uring_sqe_ext(const struct io_uring_sqe *sqe)
+{
+	return (sqe + 1);
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			 struct io_rw *rw, int ddir)
+{
+	const struct io_uring_sqe_ext *sqe_ext;
+	const struct io_issue_def *def;
+	struct io_async_rw *io;
+	int ret;
+
+	if (!(req->ctx->flags & IORING_SETUP_SQE128))
+		return -EINVAL;
+
+	sqe_ext = io_uring_sqe_ext(sqe);
+	if (READ_ONCE(sqe_ext->rsvd0[0]) || READ_ONCE(sqe_ext->rsvd0[1])
+	    || READ_ONCE(sqe_ext->rsvd0[2]) || READ_ONCE(sqe_ext->rsvd0[3]))
+		return -EINVAL;
+	if (READ_ONCE(sqe_ext->rw_pi.rsvd))
+		return -EINVAL;
+
+	def = &io_issue_defs[req->opcode];
+	if (def->vectored)
+		return -EOPNOTSUPP;
+
+	io = req->async_data;
+	io->meta.flags = READ_ONCE(sqe_ext->rw_pi.flags);
+	io->meta.app_tag = READ_ONCE(sqe_ext->rw_pi.app_tag);
+	io->meta.seed = READ_ONCE(sqe_ext->rw_pi.seed);
+	ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(sqe_ext->rw_pi.addr)),
+			  READ_ONCE(sqe_ext->rw_pi.len), &io->meta.iter);
+	if (unlikely(ret < 0))
+		return ret;
+	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+	io_meta_save_state(io);
+	return ret;
+}
+
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
+	u16 ext_cap;
 	int ret;
 
 	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +332,23 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
 	rw->kiocb.dio_complete = NULL;
+	rw->kiocb.ki_flags = 0;
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return io_prep_rw_setup(req, ddir, do_import);
+	ret = io_prep_rw_setup(req, ddir, do_import);
+
+	if (unlikely(ret))
+		return ret;
+
+	ext_cap = READ_ONCE(sqe->ext_cap);
+	if (ext_cap) {
+		if (READ_ONCE(sqe->__pad4[0]) || !(ext_cap & EXT_CAP_PI))
+			return -EINVAL;
+		ret = io_prep_rw_pi(req, sqe, rw, ddir);
+	}
+	return ret;
 }
 
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -410,7 +475,10 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
+	if (rw->kiocb.ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 	iov_iter_restore(&io->iter, &io->iter_state);
 }
 
@@ -795,7 +863,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (!(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(file);
 
-	kiocb->ki_flags = file->f_iocb_flags;
+	kiocb->ki_flags |= file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
 	if (unlikely(ret))
 		return ret;
@@ -829,6 +897,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		struct io_async_rw *io = req->async_data;
+
+		/*
+		 * We have a union of meta fields with wpq used for buffered-io
+		 * in io_async_rw, so fail it here.
+		 */
+		if (!(req->file->f_flags & O_DIRECT))
+			return -EOPNOTSUPP;
+		kiocb->private = &io->meta;
+	}
+
 	return 0;
 }
 
@@ -903,6 +983,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&io->iter, &io->iter_state);
+	if (kiocb->ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 
 	do {
 		/*
@@ -1126,6 +1208,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 ret_eagain:
 		iov_iter_restore(&io->iter, &io->iter_state);
+		if (kiocb->ki_flags & IOCB_HAS_METADATA)
+			io_meta_restore(io);
 		if (kiocb->ki_flags & IOCB_WRITE)
 			io_req_end_write(req);
 		return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@
 
 #include <linux/pagemap.h>
 
+struct io_meta_state {
+	u32			seed;
+	struct iov_iter_state	iter_meta;
+};
+
 struct io_async_rw {
 	size_t bytes_done;
 	struct iov_iter iter;
@@ -9,7 +14,14 @@ struct io_async_rw {
 	struct iovec fast_iov;
 	struct iovec *free_iovec;
 	int free_iov_nr;
-	struct wait_page_queue wpq;
+	/* wpq is for buffered io, while meta fields are used with direct io */
+	union {
+		struct wait_page_queue wpq;
+		struct {
+			struct uio_meta meta;
+			struct io_meta_state meta_state;
+		};
+	};
 };
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-- 
2.25.1
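
For completeness, a hypothetical caller of the read_with_pi() sketch
above. The geometry is assumed (one 4KiB logical block carrying 8 bytes
of PI); the real per-LBA metadata size is device-specific and would be
queried from the device. A failed integrity check is expected to surface
as a negative cqe->res:

#define _GNU_SOURCE	/* O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <liburing.h>

int main(int argc, char **argv)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	unsigned int len = 4096, pi_len = 8;	/* assumed: 4KiB LBA, 8B PI */
	void *buf, *pi_buf;
	int fd, ret;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY | O_DIRECT);	/* meta needs direct IO */
	if (fd < 0)
		return 1;
	if (posix_memalign(&buf, 4096, len) ||
	    posix_memalign(&pi_buf, 4096, pi_len))
		return 1;
	if (io_uring_queue_init(8, &ring, IORING_SETUP_SQE128))
		return 1;

	if (read_with_pi(&ring, fd, buf, len, pi_buf, pi_len) < 0)
		return 1;
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		ret = cqe->res < 0;	/* PI check failure surfaces here */
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return ret;
}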