On Mon, Feb 03, 2025 at 07:45:11AM -0800, Keith Busch wrote: > From: Keith Busch <kbusch@xxxxxxxxxx> > > This is a new look at supporting zero copy with ublk. > > The previous version from Ming can be viewed here: > > https://lore.kernel.org/linux-block/20241107110149.890530-1-ming.lei@xxxxxxxxxx/ > > Based on the feedback from that thread, the desired io_uring interfaces > needed to be simpler, and the kernel registered resources need to behave > more similiar to user registered buffers. > > This series introduces a new resource node type, KBUF, which, like the > BUFFER resource, needs to be installed into an io_uring buf_node table > in order for the user to access it in a fixed buffer command. The > new io_uring kernel API provides a way for a user to register a struct > request's bvec to a specific index, and a way to unregister it. > > When the ublk server receives notification of a new command, it must > first select an index and register the zero copy buffer. It may use that > index for any number of fixed buffer commands, then it must unregister > the index when it's done. This can all be done in a single io_uring_enter > if desired, or it can be split into multiple enters if needed. I suspect it may not be done in single io_uring_enter() because there is strict dependency among the three OPs(register buffer, read/write, unregister buffer). > > The io_uring instance that gets the zero copy registration doesn't > necessarily need to be the same ring that is receiving notifcations from > the ublk_drv module. This allows you to split frontend and backend rings > if desired. > > At the end of this cover letter, I've provided a patch to the ublksrv to > demonstrate how to use this. > > Jens Axboe (1): > io_uring: use node for import > > Keith Busch (5): > block: const blk_rq_nr_phys_segments request > io_uring: add support for kernel registered bvecs > ublk: zc register/unregister bvec > io_uring: add abstraction for buf_table rsrc data > io_uring: cache nodes and mapped buffers > > drivers/block/ublk_drv.c | 139 +++++++++++++----- > include/linux/blk-mq.h | 2 +- > include/linux/io_uring.h | 1 + > include/linux/io_uring_types.h | 25 +++- > include/uapi/linux/ublk_cmd.h | 4 + > io_uring/fdinfo.c | 8 +- > io_uring/filetable.c | 2 +- > io_uring/net.c | 5 +- > io_uring/nop.c | 2 +- > io_uring/register.c | 2 +- > io_uring/rsrc.c | 259 ++++++++++++++++++++++++++------- > io_uring/rsrc.h | 8 +- > io_uring/rw.c | 4 +- > io_uring/uring_cmd.c | 4 +- > 14 files changed, 351 insertions(+), 114 deletions(-) > > -- > 2.43.5 > > ublksrv: > > https://github.com/ublk-org/ublksrv > > --- > include/ublk_cmd.h | 4 +++ > include/ublksrv_tgt.h | 13 ++++++++ > lib/ublksrv.c | 9 ++++++ > tgt_loop.cpp | 74 +++++++++++++++++++++++++++++++++++++++++-- > ublksrv_tgt.cpp | 2 +- > 5 files changed, 99 insertions(+), 3 deletions(-) > > diff --git a/include/ublk_cmd.h b/include/ublk_cmd.h > index 0150003..07439be 100644 > --- a/include/ublk_cmd.h > +++ b/include/ublk_cmd.h > @@ -94,6 +94,10 @@ > _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) > #define UBLK_U_IO_NEED_GET_DATA \ > _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) > +#define UBLK_U_IO_REGISTER_IO_BUF \ > + _IOWR('u', 0x23, struct ublksrv_io_cmd) > +#define UBLK_U_IO_UNREGISTER_IO_BUF \ > + _IOWR('u', 0x24, struct ublksrv_io_cmd) > > /* only ABORT means that no re-fetch */ > #define UBLK_IO_RES_OK 0 > diff --git a/include/ublksrv_tgt.h b/include/ublksrv_tgt.h > index 1deee2b..6291531 100644 > --- a/include/ublksrv_tgt.h > +++ b/include/ublksrv_tgt.h > @@ -189,4 +189,17 @@ static inline void ublk_get_sqe_pair(struct io_uring *r, > *sqe2 = io_uring_get_sqe(r); > } > > +static inline void ublk_get_sqe_three(struct io_uring *r, > + struct io_uring_sqe **sqe1, struct io_uring_sqe **sqe2, > + struct io_uring_sqe **sqe3) > +{ > + unsigned left = io_uring_sq_space_left(r); > + > + if (left < 3) > + io_uring_submit(r); > + > + *sqe1 = io_uring_get_sqe(r); > + *sqe2 = io_uring_get_sqe(r); > + *sqe3 = io_uring_get_sqe(r); > +} > #endif > diff --git a/lib/ublksrv.c b/lib/ublksrv.c > index 16a9e13..7205247 100644 > --- a/lib/ublksrv.c > +++ b/lib/ublksrv.c > @@ -619,6 +619,15 @@ skip_alloc_buf: > goto fail; > } > > + if (ctrl_dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { > + ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth); > + if (ret) { > + ublk_err("ublk dev %d queue %d register spare buffers failed %d", > + q->dev->ctrl_dev->dev_info.dev_id, q->q_id, ret); > + goto fail; > + } > + } > + > io_uring_register_ring_fd(&q->ring); > > /* > diff --git a/tgt_loop.cpp b/tgt_loop.cpp > index 0f16676..ce44c7d 100644 > --- a/tgt_loop.cpp > +++ b/tgt_loop.cpp > @@ -246,12 +246,62 @@ static inline int loop_fallocate_mode(const struct ublksrv_io_desc *iod) > return mode; > } > > +static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, > + int dev_fd, int tag, int q_id, __u64 index) > +{ > + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; > + > + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); > + sqe->opcode = IORING_OP_URING_CMD; > + sqe->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE; IOSQE_IO_LINK is missed, because the following buffer consumer OP has to be issued after this buffer register OP is completed. > + sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; > + > + cmd->tag = tag; > + cmd->addr = index; > + cmd->q_id = q_id; > +} > + > +static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, > + int dev_fd, int tag, int q_id, __u64 index) > +{ > + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; > + > + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); > + sqe->opcode = IORING_OP_URING_CMD; > + sqe->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE; > + sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; IOSQE_IO_LINK is missed, because buffer un-register OP has to be issued after the previous buffer consumer OP is completed. > + > + cmd->tag = tag; > + cmd->addr = index; > + cmd->q_id = q_id; > +} > + > static void loop_queue_tgt_read(const struct ublksrv_queue *q, > const struct ublksrv_io_desc *iod, int tag) > { > + const struct ublksrv_ctrl_dev_info *info = > + ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev)); > unsigned ublk_op = ublksrv_get_op(iod); > > - if (user_copy) { > + if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) { > + struct io_uring_sqe *reg; > + struct io_uring_sqe *read; > + struct io_uring_sqe *ureg; > + > + ublk_get_sqe_three(q->ring_ptr, ®, &read, &ureg); > + > + io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag); > + > + io_uring_prep_read_fixed(read, 1 /*fds[1]*/, > + 0, > + iod->nr_sectors << 9, > + iod->start_sector << 9, > + tag); > + io_uring_sqe_set_flags(read, IOSQE_FIXED_FILE); > + read->user_data = build_user_data(tag, ublk_op, 0, 1); Does this interface support to read to partial buffer? Which is useful for stacking device cases. Also does this interface support to consume the buffer from multiple OPs concurrently? Thanks, Ming