On Mon, Feb 10, 2025 at 04:56:44PM -0800, Keith Busch wrote: > From: Keith Busch <kbusch@xxxxxxxxxx> > > Provide new operations for the user to request mapping an active request > to an io uring instance's buf_table. The user has to provide the index > it wants to install the buffer. > > A reference count is taken on the request to ensure it can't be > completed while it is active in a ring's buf_table. > > Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> > --- > drivers/block/ublk_drv.c | 145 +++++++++++++++++++++++++--------- > include/uapi/linux/ublk_cmd.h | 4 + > 2 files changed, 113 insertions(+), 36 deletions(-) > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c > index 529085181f355..ccfda7b2c24da 100644 > --- a/drivers/block/ublk_drv.c > +++ b/drivers/block/ublk_drv.c > @@ -51,6 +51,9 @@ > /* private ioctl command mirror */ > #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) > > +#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) > +#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) The UBLK_IO_REGISTER_IO_BUF command may be completed while the buffer has not yet been consumed by any RW_FIXED operation, in the following cases: - the application doesn't submit any RW_FIXED consumer OP - io_uring_enter() only issued UBLK_IO_REGISTER_IO_BUF, and the other OPs couldn't be issued because of resource exhaustion ... Then io_uring_enter() returns, and the application panics or is killed — how is a buffer leak avoided in that case? This needs to be handled in the io_uring cancel code, by calling ->release() if the kbuffer node hasn't been released yet. Likewise, UBLK_IO_UNREGISTER_IO_BUF still needs to call ->release() if the node's buffer was never used.
> + > /* All UBLK_F_* have to be included into UBLK_F_ALL */ > #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ > | UBLK_F_URING_CMD_COMP_IN_TASK \ > @@ -76,6 +79,9 @@ struct ublk_rq_data { > struct llist_node node; > > struct kref ref; > + > +#define UBLK_ZC_REGISTERED 0 > + unsigned long flags; > }; > > struct ublk_uring_cmd_pdu { > @@ -201,7 +207,7 @@ static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, > int tag); > static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) > { > - return ub->dev_info.flags & UBLK_F_USER_COPY; > + return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); > } > > static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) > @@ -581,7 +587,7 @@ static void ublk_apply_params(struct ublk_device *ub) > > static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) > { > - return ubq->flags & UBLK_F_USER_COPY; > + return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); > } > > static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) > @@ -1747,6 +1753,102 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, > io_uring_cmd_mark_cancelable(cmd, issue_flags); > } > > +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, > + struct ublk_queue *ubq, int tag, size_t offset) > +{ > + struct request *req; > + > + if (!ublk_need_req_ref(ubq)) > + return NULL; > + > + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); > + if (!req) > + return NULL; > + > + if (!ublk_get_req_ref(ubq, req)) > + return NULL; > + > + if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) > + goto fail_put; > + > + if (!ublk_rq_has_data(req)) > + goto fail_put; > + > + if (offset > blk_rq_bytes(req)) > + goto fail_put; > + > + return req; > +fail_put: > + ublk_put_req_ref(ubq, req); > + return NULL; > +} > + > +static void ublk_io_release(void *priv) > +{ > + struct request *rq = priv; > + struct ublk_queue *ubq = 
rq->mq_hctx->driver_data; > + > + ublk_put_req_ref(ubq, rq); > +} It isn't enough to just get & put the request reference here, between registering the buffer and freeing the registered node buf, because the same reference can also be dropped from ublk_commit_completion(), which is reached by queueing UBLK_IO_COMMIT_AND_FETCH_REQ — and a buggy application may queue that command multiple times to free the request. One solution is to not allow request completion until ->release() has returned. Thanks, Ming