From: Keith Busch <kbusch@xxxxxxxxxx> This is a new look at supporting zero copy with ublk. The previous version from Ming can be viewed here: https://lore.kernel.org/linux-block/20241107110149.890530-1-ming.lei@xxxxxxxxxx/ Based on the feedback from that thread, the desired io_uring interfaces needed to be simpler, and the kernel registered resources need to behave more similiar to user registered buffers. This series introduces a new resource node type, KBUF, which, like the BUFFER resource, needs to be installed into an io_uring buf_node table in order for the user to access it in a fixed buffer command. The new io_uring kernel API provides a way for a user to register a struct request's bvec to a specific index, and a way to unregister it. When the ublk server receives notification of a new command, it must first select an index and register the zero copy buffer. It may use that index for any number of fixed buffer commands, then it must unregister the index when it's done. This can all be done in a single io_uring_enter if desired, or it can be split into multiple enters if needed. The io_uring instance that gets the zero copy registration doesn't necessarily need to be the same ring that is receiving notifcations from the ublk_drv module. This allows you to split frontend and backend rings if desired. At the end of this cover letter, I've provided a patch to the ublksrv to demonstrate how to use this. Jens Axboe (1): io_uring: use node for import Keith Busch (5): block: const blk_rq_nr_phys_segments request io_uring: add support for kernel registered bvecs ublk: zc register/unregister bvec io_uring: add abstraction for buf_table rsrc data io_uring: cache nodes and mapped buffers drivers/block/ublk_drv.c | 139 +++++++++++++----- include/linux/blk-mq.h | 2 +- include/linux/io_uring.h | 1 + include/linux/io_uring_types.h | 25 +++- include/uapi/linux/ublk_cmd.h | 4 + io_uring/fdinfo.c | 8 +- io_uring/filetable.c | 2 +- io_uring/net.c | 5 +- io_uring/nop.c | 2 +- io_uring/register.c | 2 +- io_uring/rsrc.c | 259 ++++++++++++++++++++++++++------- io_uring/rsrc.h | 8 +- io_uring/rw.c | 4 +- io_uring/uring_cmd.c | 4 +- 14 files changed, 351 insertions(+), 114 deletions(-) -- 2.43.5 ublksrv: https://github.com/ublk-org/ublksrv --- include/ublk_cmd.h | 4 +++ include/ublksrv_tgt.h | 13 ++++++++ lib/ublksrv.c | 9 ++++++ tgt_loop.cpp | 74 +++++++++++++++++++++++++++++++++++++++++-- ublksrv_tgt.cpp | 2 +- 5 files changed, 99 insertions(+), 3 deletions(-) diff --git a/include/ublk_cmd.h b/include/ublk_cmd.h index 0150003..07439be 100644 --- a/include/ublk_cmd.h +++ b/include/ublk_cmd.h @@ -94,6 +94,10 @@ _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) #define UBLK_U_IO_NEED_GET_DATA \ _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) +#define UBLK_U_IO_REGISTER_IO_BUF \ + _IOWR('u', 0x23, struct ublksrv_io_cmd) +#define UBLK_U_IO_UNREGISTER_IO_BUF \ + _IOWR('u', 0x24, struct ublksrv_io_cmd) /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 diff --git a/include/ublksrv_tgt.h b/include/ublksrv_tgt.h index 1deee2b..6291531 100644 --- a/include/ublksrv_tgt.h +++ b/include/ublksrv_tgt.h @@ -189,4 +189,17 @@ static inline void ublk_get_sqe_pair(struct io_uring *r, *sqe2 = io_uring_get_sqe(r); } +static inline void ublk_get_sqe_three(struct io_uring *r, + struct io_uring_sqe **sqe1, struct io_uring_sqe **sqe2, + struct io_uring_sqe **sqe3) +{ + unsigned left = io_uring_sq_space_left(r); + + if (left < 3) + io_uring_submit(r); + + *sqe1 = io_uring_get_sqe(r); + *sqe2 = io_uring_get_sqe(r); + *sqe3 = io_uring_get_sqe(r); +} #endif diff --git a/lib/ublksrv.c b/lib/ublksrv.c index 16a9e13..7205247 100644 --- a/lib/ublksrv.c +++ b/lib/ublksrv.c @@ -619,6 +619,15 @@ skip_alloc_buf: goto fail; } + if (ctrl_dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { + ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth); + if (ret) { + ublk_err("ublk dev %d queue %d register spare buffers failed %d", + q->dev->ctrl_dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + } + io_uring_register_ring_fd(&q->ring); /* diff --git a/tgt_loop.cpp b/tgt_loop.cpp index 0f16676..ce44c7d 100644 --- a/tgt_loop.cpp +++ b/tgt_loop.cpp @@ -246,12 +246,62 @@ static inline int loop_fallocate_mode(const struct ublksrv_io_desc *iod) return mode; } +static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, + int dev_fd, int tag, int q_id, __u64 index) +{ + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE; + sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; + + cmd->tag = tag; + cmd->addr = index; + cmd->q_id = q_id; +} + +static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, + int dev_fd, int tag, int q_id, __u64 index) +{ + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE; + sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; + + cmd->tag = tag; + cmd->addr = index; + cmd->q_id = q_id; +} + static void loop_queue_tgt_read(const struct ublksrv_queue *q, const struct ublksrv_io_desc *iod, int tag) { + const struct ublksrv_ctrl_dev_info *info = + ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev)); unsigned ublk_op = ublksrv_get_op(iod); - if (user_copy) { + if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) { + struct io_uring_sqe *reg; + struct io_uring_sqe *read; + struct io_uring_sqe *ureg; + + ublk_get_sqe_three(q->ring_ptr, ®, &read, &ureg); + + io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag); + + io_uring_prep_read_fixed(read, 1 /*fds[1]*/, + 0, + iod->nr_sectors << 9, + iod->start_sector << 9, + tag); + io_uring_sqe_set_flags(read, IOSQE_FIXED_FILE); + read->user_data = build_user_data(tag, ublk_op, 0, 1); + + io_uring_prep_buf_unregister(ureg, 0, tag, q->q_id, tag); + } else if (user_copy) { struct io_uring_sqe *sqe, *sqe2; __u64 pos = ublk_pos(q->q_id, tag, 0); void *buf = ublksrv_queue_get_io_buf(q, tag); @@ -286,9 +336,29 @@ static void loop_queue_tgt_read(const struct ublksrv_queue *q, static void loop_queue_tgt_write(const struct ublksrv_queue *q, const struct ublksrv_io_desc *iod, int tag) { + const struct ublksrv_ctrl_dev_info *info = + ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev)); unsigned ublk_op = ublksrv_get_op(iod); - if (user_copy) { + if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) { + struct io_uring_sqe *reg; + struct io_uring_sqe *write; + struct io_uring_sqe *ureg; + + ublk_get_sqe_three(q->ring_ptr, ®, &write, &ureg); + + io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag); + + io_uring_prep_write_fixed(write, 1 /*fds[1]*/, + 0, + iod->nr_sectors << 9, + iod->start_sector << 9, + tag); + io_uring_sqe_set_flags(write, IOSQE_FIXED_FILE); + write->user_data = build_user_data(tag, ublk_op, 0, 1); + + io_uring_prep_buf_unregister(ureg, 0, tag, q->q_id, tag); + } else if (user_copy) { struct io_uring_sqe *sqe, *sqe2; __u64 pos = ublk_pos(q->q_id, tag, 0); void *buf = ublksrv_queue_get_io_buf(q, tag); diff --git a/ublksrv_tgt.cpp b/ublksrv_tgt.cpp index 8f9cf28..f3ebe14 100644 --- a/ublksrv_tgt.cpp +++ b/ublksrv_tgt.cpp @@ -723,7 +723,7 @@ static int cmd_dev_add(int argc, char *argv[]) data.tgt_type = optarg; break; case 'z': - data.flags |= UBLK_F_SUPPORT_ZERO_COPY; + data.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY; break; case 'q': data.nr_hw_queues = strtol(optarg, NULL, 10); -- 2.43.5