On 5/5/22 10:17 AM, Jens Axboe wrote:
> On 5/5/22 12:06 AM, Kanchan Joshi wrote:
>> +static int io_uring_cmd_prep(struct io_kiocb *req,
>> +                            const struct io_uring_sqe *sqe)
>> +{
>> +        struct io_uring_cmd *ioucmd = &req->uring_cmd;
>> +        struct io_ring_ctx *ctx = req->ctx;
>> +
>> +        if (ctx->flags & IORING_SETUP_IOPOLL)
>> +                return -EOPNOTSUPP;
>> +        /* do not support uring-cmd without big SQE/CQE */
>> +        if (!(ctx->flags & IORING_SETUP_SQE128))
>> +                return -EOPNOTSUPP;
>> +        if (!(ctx->flags & IORING_SETUP_CQE32))
>> +                return -EOPNOTSUPP;
>> +        if (sqe->ioprio || sqe->rw_flags)
>> +                return -EINVAL;
>> +        ioucmd->cmd = sqe->cmd;
>> +        ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
>> +        return 0;
>> +}
>
> While looking at the other suggested changes, I noticed a more
> fundamental issue with the passthrough support. For any other command,
> SQE contents are stable once prep has been done. The above does do that
> for the basic items, but this case is special as the lower level command
> itself resides in the SQE.
>
> For cases where the command needs deferral, it's problematic. There are
> two main cases where this can happen:
>
> - The issue attempt yields -EAGAIN (we ran out of requests, etc). If you
>   look at other commands, if they have data that doesn't fit in the
>   io_kiocb itself, then they need to allocate room for that data and have
>   it be persistent.
>
> - Deferral is specified by the application, using e.g. IOSQE_IO_LINK or
>   IOSQE_ASYNC.
>
> We're totally missing support for both of these cases. Consider the case
> where the ring is set up with an SQ size of 1. You prep a passthrough
> command (request A) and issue it with io_uring_submit(). Due to one of
> the two above-mentioned conditions, the internal request is deferred.
> Either it was sent to ->uring_cmd() but we got -EAGAIN, or it was
> deferred even before that happened. The application doesn't know this
> happened, it gets another SQE to submit a new request (request B). Fills
> it in, calls io_uring_submit(). Since we only have one SQE available in
> that ring, when request A gets re-issued, it's now happily reading SQE
> contents from command B. Oops.
>
> This is why prep handlers are the only ones that get an sqe passed to
> them. They are supposed to ensure that we no longer read from the SQE
> past that point. Applications can always rely on the fact that once
> io_uring_submit() has been done, which consumes the SQE in the SQ ring,
> no further reads are done from that SQE.
>
> IOW, we need io_req_prep_async() handling for URING_CMD, which needs to
> allocate the full 80 bytes and copy them over. Subsequent issue attempts
> will then use that memory rather than the SQE parts. Just need to find a
> sane way to do that so we don't need to make the usual prep + direct
> issue path any slower.
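To make the reuse hazard above concrete, here's a rough userspace sketch
of the pattern. It's illustrative only: plain liburing calls, assuming a
liburing that knows about 128-byte SQEs, a ring set up with
IORING_SETUP_SQE128, and the inline sqe->cmd payload area this series
adds; not a finished API.

/* Illustrative sketch: assumes IORING_SETUP_SQE128 rings and the
 * sqe->cmd payload area added by this series.
 */
#include <errno.h>
#include <string.h>
#include <liburing.h>

static int submit_passthru(struct io_uring *ring, int fd, __u32 cmd_op,
                           const void *cmd, size_t cmd_len)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -EBUSY;
        memset(sqe, 0, 2 * sizeof(*sqe));       /* 128-byte SQE */
        sqe->opcode = IORING_OP_URING_CMD;
        sqe->fd = fd;
        sqe->cmd_op = cmd_op;
        sqe->flags |= IOSQE_ASYNC;              /* force the deferral case */
        memcpy(sqe->cmd, cmd, cmd_len);         /* <= 80 bytes, lives in the SQE */
        return io_uring_submit(ring);           /* SQ slot is consumed here */
}

With an SQ of size 1, the very next io_uring_get_sqe() hands back the
same slot. If request A was deferred (IOSQE_ASYNC, a link, or -EAGAIN on
issue) and the kernel re-reads the command from the SQE when it re-issues
the request, it now sees request B's payload, which is exactly the bug
described above.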
This one should take care of that. Some parts should be folded into
patch 1, other bits into the nvme adoption.

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 8c3b15d3e86d..7daa95d50db1 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -411,8 +411,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
 {
-	struct nvme_uring_cmd *cmd =
-		(struct nvme_uring_cmd *)ioucmd->cmd;
+	struct nvme_uring_cmd *cmd = ioucmd->cmd;
 	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
 	struct nvme_command c;
 	struct request *req;
@@ -548,8 +547,8 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return __nvme_ioctl(ns, cmd, (void __user *)arg);
 }
 
-static void nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
-		unsigned int issue_flags)
+static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
+		unsigned int issue_flags)
 {
 	int ret;
 
@@ -567,15 +566,15 @@ static void nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
 		ret = -ENOTTY;
 	}
 
-	if (ret != -EIOCBQUEUED)
-		io_uring_cmd_done(ioucmd, ret, 0);
+	return ret;
 }
 
-void nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
+int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
 {
 	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
 			struct nvme_ns, cdev);
-	nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+
+	return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
 }
 
 #ifdef CONFIG_NVME_MULTIPATH
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 761ad6c629c4..26adc7660d28 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -783,7 +783,7 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
-void nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
+int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 void nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index cf7087dc12b3..cee7fc95f2c2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1251,6 +1251,8 @@ static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_URING_CMD] = {
 		.needs_file		= 1,
 		.plug			= 1,
+		.async_size		= 2 * sizeof(struct io_uring_sqe) -
+					  offsetof(struct io_uring_sqe, cmd),
 	},
 };
 
@@ -4936,6 +4938,20 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 
+static int io_uring_cmd_prep_async(struct io_kiocb *req)
+{
+	struct io_uring_cmd *ioucmd = &req->uring_cmd;
+	struct io_ring_ctx *ctx = req->ctx;
+	size_t cmd_size = sizeof(struct io_uring_sqe) -
+				offsetof(struct io_uring_sqe, cmd);
+
+	if (ctx->flags & IORING_SETUP_SQE128)
+		cmd_size += sizeof(struct io_uring_sqe);
+
+	memcpy(req->async_data, ioucmd->cmd, cmd_size);
+	return 0;
+}
+
 static int io_uring_cmd_prep(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe)
 {
@@ -4951,7 +4967,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req,
 		return -EOPNOTSUPP;
 	if (sqe->ioprio || sqe->rw_flags)
 		return -EINVAL;
-	ioucmd->cmd = sqe->cmd;
+	ioucmd->cmd = (void *) sqe->cmd;
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 	return 0;
 }
@@ -4960,10 +4976,18 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct file *file = req->file;
 	struct io_uring_cmd *ioucmd = &req->uring_cmd;
+	int ret;
 
 	if (!req->file->f_op->uring_cmd)
 		return -EOPNOTSUPP;
-	file->f_op->uring_cmd(ioucmd, issue_flags);
+
+	if (req_has_async_data(req))
+		ioucmd->cmd = req->async_data;
+
+	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
+	if (ret != -EIOCBQUEUED)
+		io_uring_cmd_done(ioucmd, ret, 0);
+
 	return 0;
 }
 
@@ -7849,6 +7873,8 @@ static int io_req_prep_async(struct io_kiocb *req)
 		return io_recvmsg_prep_async(req);
 	case IORING_OP_CONNECT:
 		return io_connect_prep_async(req);
+	case IORING_OP_URING_CMD:
+		return io_uring_cmd_prep_async(req);
 	}
 	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
 		    req->opcode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 30c98d9993df..87b5af1d9fbe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1996,7 +1996,7 @@ struct file_operations {
 				   struct file *file_out, loff_t pos_out,
 				   loff_t len, unsigned int remap_flags);
 	int (*fadvise)(struct file *, loff_t, loff_t, int);
-	void (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
+	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 } __randomize_layout;
 
 struct inode_operations {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index e4cbc80949ce..74371503478a 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -14,10 +14,11 @@ enum io_uring_cmd_flags {
 
 struct io_uring_cmd {
 	struct file	*file;
-	const u8	*cmd;
+	void		*cmd;
 	/* callback to defer completions to task context */
 	void (*task_work_cb)(struct io_uring_cmd *cmd);
 	u32		cmd_op;
+	u32		pad;
 	u8		pdu[32]; /* available inline for free use */
 };
 
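As a quick sanity check on the 80 bytes mentioned above: with the SQE
layout this series adds, everything in a 128-byte SQE past the cmd[]
member is command payload. A small userspace check, assuming the updated
uapi header from this series is installed:

#include <stdio.h>
#include <stddef.h>
#include <linux/io_uring.h>

int main(void)
{
        size_t big_sqe = 2 * sizeof(struct io_uring_sqe);
        size_t cmd_off = offsetof(struct io_uring_sqe, cmd);

        /* expect 128 / 48 / 80 with the layout from this series */
        printf("big SQE %zu, cmd offset %zu, payload %zu\n",
               big_sqe, cmd_off, big_sqe - cmd_off);
        return 0;
}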
-- 
Jens Axboe