On Mon, Mar 3, 2025 at 3:01 PM Caleb Sander Mateos <csander@xxxxxxxxxxxxxxx> wrote: > > On Mon, Mar 3, 2025 at 7:50 AM Pavel Begunkov <asml.silence@xxxxxxxxx> wrote: > > > > Implement registered buffer vectored reads with new opcodes > > IORING_OP_WRITEV_FIXED and IORING_OP_READV_FIXED. > > > > Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx> > > --- > > include/uapi/linux/io_uring.h | 2 ++ > > io_uring/opdef.c | 39 +++++++++++++++++++++++++++ > > io_uring/rw.c | 51 +++++++++++++++++++++++++++++++++++ > > io_uring/rw.h | 2 ++ > > 4 files changed, 94 insertions(+) > > > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > > index 1e02e94bc26d..9dd384b369ee 100644 > > --- a/include/uapi/linux/io_uring.h > > +++ b/include/uapi/linux/io_uring.h > > @@ -280,6 +280,8 @@ enum io_uring_op { > > IORING_OP_BIND, > > IORING_OP_LISTEN, > > IORING_OP_RECV_ZC, > > + IORING_OP_READV_FIXED, > > + IORING_OP_WRITEV_FIXED, > > > > /* this goes last, obviously */ > > IORING_OP_LAST, > > diff --git a/io_uring/opdef.c b/io_uring/opdef.c > > index 9511262c513e..6655d2cbf74d 100644 > > --- a/io_uring/opdef.c > > +++ b/io_uring/opdef.c > > @@ -529,6 +529,35 @@ const struct io_issue_def io_issue_defs[] = { > > .prep = io_eopnotsupp_prep, > > #endif > > }, > > + [IORING_OP_READV_FIXED] = { > > + .needs_file = 1, > > + .unbound_nonreg_file = 1, > > + .pollin = 1, > > + .plug = 1, > > + .audit_skip = 1, > > + .ioprio = 1, > > + .iopoll = 1, > > + .iopoll_queue = 1, > > + .vectored = 1, > > + .async_size = sizeof(struct io_async_rw), > > + .prep = io_prep_readv_fixed, > > + .issue = io_read, > > + }, > > + [IORING_OP_WRITEV_FIXED] = { > > + .needs_file = 1, > > + .hash_reg_file = 1, > > + .unbound_nonreg_file = 1, > > + .pollout = 1, > > + .plug = 1, > > + .audit_skip = 1, > > + .ioprio = 1, > > + .iopoll = 1, > > + .iopoll_queue = 1, > > + .vectored = 1, > > + .async_size = sizeof(struct io_async_rw), > > + .prep = io_prep_writev_fixed, > > + .issue = io_write, > > + }, > > }; > > > > const struct io_cold_def io_cold_defs[] = { > > @@ -761,6 +790,16 @@ const struct io_cold_def io_cold_defs[] = { > > [IORING_OP_RECV_ZC] = { > > .name = "RECV_ZC", > > }, > > + [IORING_OP_READV_FIXED] = { > > + .name = "READV_FIXED", > > + .cleanup = io_readv_writev_cleanup, > > + .fail = io_rw_fail, > > + }, > > + [IORING_OP_WRITEV_FIXED] = { > > + .name = "WRITEV_FIXED", > > + .cleanup = io_readv_writev_cleanup, > > + .fail = io_rw_fail, > > + }, > > }; > > > > const char *io_uring_get_opcode(u8 opcode) > > diff --git a/io_uring/rw.c b/io_uring/rw.c > > index ad7f647d48e9..4c4229f41aaa 100644 > > --- a/io_uring/rw.c > > +++ b/io_uring/rw.c > > @@ -381,6 +381,57 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) > > return __io_prep_rw(req, sqe, ITER_SOURCE); > > } > > > > +static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir) > > +{ > > + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); > > + struct io_async_rw *io = req->async_data; > > + const struct iovec __user *uvec; > > + size_t uvec_segs = rw->len; > > + struct iovec *iov; > > + int iovec_off, ret; > > + void *res; > > + > > + if (uvec_segs > io->vec.nr) { > > + ret = io_vec_realloc(&io->vec, uvec_segs); > > + if (ret) > > + return ret; > > + req->flags |= REQ_F_NEED_CLEANUP; > > + } > > + /* pad iovec to the right */ > > + iovec_off = io->vec.nr - uvec_segs; > > + iov = io->vec.iovec + iovec_off; > > + uvec = u64_to_user_ptr(rw->addr); > > + res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, > > + io_is_compat(req->ctx)); > > + if (IS_ERR(res)) > > + return PTR_ERR(res); > > + > > + ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, > > + uvec_segs, iovec_off, 0); > > So the iovecs are being imported at prep time rather than issue time? > I suppose since only user registered buffers are allowed and not > kernel bvecs, you aren't concerned about interactions with the ublk > bvec register/unregister operations? I think in principle the > difference between prep and issue time is still observable if the same > registered buffer index is being used alternately for user and kernel > registered buffers. Never mind, I see you change this in the next patch. Best, Caleb