On Wed, Jan 27, 2021 at 02:25:38PM -0700, Jens Axboe wrote: > This is a file private kind of request. io_uring doesn't know what's > in this command type, it's for the file_operations->uring_cmd() > handler to deal with. > > Signed-off-by: Jens Axboe <axboe@xxxxxxxxx> > --- > fs/io_uring.c | 59 +++++++++++++++++++++++++++++++++++ > include/linux/io_uring.h | 12 +++++++ > include/uapi/linux/io_uring.h | 1 + > 3 files changed, 72 insertions(+) > > diff --git a/fs/io_uring.c b/fs/io_uring.c > index 03748faa5295..55c2714a591e 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -712,6 +712,7 @@ struct io_kiocb { > struct io_shutdown shutdown; > struct io_rename rename; > struct io_unlink unlink; > + struct io_uring_cmd uring_cmd; > /* use only after cleaning per-op data, see io_clean_op() */ > struct io_completion compl; > }; > @@ -805,6 +806,8 @@ struct io_op_def { > unsigned needs_async_data : 1; > /* should block plug */ > unsigned plug : 1; > + /* doesn't support personality */ > + unsigned no_personality : 1; > /* size of async data needed, if any */ > unsigned short async_size; > unsigned work_flags; > @@ -998,6 +1001,11 @@ static const struct io_op_def io_op_defs[] = { > .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | > IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, > }, > + [IORING_OP_URING_CMD] = { > + .needs_file = 1, > + .no_personality = 1, > + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, > + }, > }; > > enum io_mem_account { > @@ -3797,6 +3805,47 @@ static int io_unlinkat(struct io_kiocb *req, bool force_nonblock) > return 0; > } > > +static void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret) > +{ > + struct io_kiocb *req = container_of(cmd, struct io_kiocb, uring_cmd); > + > + if (ret < 0) > + req_set_fail_links(req); > + io_req_complete(req, ret); > +} > + > +static int io_uring_cmd_prep(struct io_kiocb *req, > + const struct io_uring_sqe *sqe) > +{ > + struct io_uring_cmd *cmd = &req->uring_cmd; > + > + if (!req->file->f_op->uring_cmd) > + return -EOPNOTSUPP; > + > + memcpy(&cmd->pdu, (void *) &sqe->off, sizeof(cmd->pdu)); Hmmm. struct io_uring_pdu is (by my count) 6x uint64_t (==48 bytes) in size. This starts copying the pdu from byte 8 in struct io_uring_sqe, and the sqe is 64 bytes in size. I guess (having not played much with io_uring) that the stuff in the first eight bytes of the sqe are header info that's common to all io_uring operations, and hence not passed to io_uring_cmd*. Assuming that I got that right, that means that the pdu information doesn't actually go all the way to the end of the sqe, which currently is just a bunch of padding. Was that intentional, or does this mean that io_uring_pdu could actually be 8 bytes longer? Also, I thought io_uring_seq.user_data was supposed to coincide with io_uring_pdu.reserved? They don't seem to...? (I could be totally off here, fwiw.) The reason why I'm counting bytes so stingily is that xfs_scrub issues millions upon millions of ioctl calls to scrub an XFS. Wouldn't it be nice if there was a way to submit a single userspace buffer to the kernel and let it run every scrubber for that fs object in order? I could cram all that data into the pdu struct ... if it had 56 bytes of space. If not, it wouldn't be a big deal to use one of the data[4] fields as a pointer to a larger struct, but where's the fun in that? :) Granted I'm programming speculatively in my head, not building an actual prototype. There are all kinds of other questions I have, like, can a uring command handler access the task struct or the userspace memory of the process it was called from? What happens when the user is madly pounding on ^C while uring commands are running? I should probably figure out the answers to those questions and maybe even write/crib a program first... --D > + cmd->done = io_uring_cmd_done; > + return 0; > +} > + > +static int io_uring_cmd(struct io_kiocb *req, bool force_nonblock) > +{ > + enum io_uring_cmd_flags flags = 0; > + struct file *file = req->file; > + int ret; > + > + if (force_nonblock) > + flags |= IO_URING_F_NONBLOCK; > + > + ret = file->f_op->uring_cmd(&req->uring_cmd, flags); > + /* queued async, consumer will call ->done() when complete */ > + if (ret == -EIOCBQUEUED) > + return 0; > + else if (ret < 0) > + req_set_fail_links(req); > + io_req_complete(req, ret); > + return 0; > +} > + > static int io_shutdown_prep(struct io_kiocb *req, > const struct io_uring_sqe *sqe) > { > @@ -6093,6 +6142,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > return io_renameat_prep(req, sqe); > case IORING_OP_UNLINKAT: > return io_unlinkat_prep(req, sqe); > + case IORING_OP_URING_CMD: > + return io_uring_cmd_prep(req, sqe); > } > > printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", > @@ -6351,6 +6402,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, > case IORING_OP_UNLINKAT: > ret = io_unlinkat(req, force_nonblock); > break; > + case IORING_OP_URING_CMD: > + ret = io_uring_cmd(req, force_nonblock); > + break; > default: > ret = -EINVAL; > break; > @@ -6865,6 +6919,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, > if (id) { > struct io_identity *iod; > > + if (io_op_defs[req->opcode].no_personality) > + return -EINVAL; > + > iod = idr_find(&ctx->personality_idr, id); > if (unlikely(!iod)) > return -EINVAL; > @@ -10260,6 +10317,8 @@ static int __init io_uring_init(void) > BUILD_BUG_SQE_ELEM(40, __u16, buf_index); > BUILD_BUG_SQE_ELEM(42, __u16, personality); > BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); > + BUILD_BUG_ON(offsetof(struct io_uring_sqe, user_data) != > + offsetof(struct io_uring_pdu, reserved)); > > BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); > BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); > diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h > index 35b2d845704d..e4e822d86e22 100644 > --- a/include/linux/io_uring.h > +++ b/include/linux/io_uring.h > @@ -34,6 +34,18 @@ struct io_uring_task { > bool sqpoll; > }; > > +struct io_uring_pdu { > + __u64 data[4]; /* available for free use */ > + __u64 reserved; /* can't be used by application! */ > + __u64 data2; /* available or free use */ > +}; > + > +struct io_uring_cmd { > + struct file *file; > + struct io_uring_pdu pdu; > + void (*done)(struct io_uring_cmd *, ssize_t); > +}; > + > #if defined(CONFIG_IO_URING) > struct sock *io_uring_get_socket(struct file *file); > void __io_uring_task_cancel(void); > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index ac4e1738a9af..0a0de40a3a5c 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -137,6 +137,7 @@ enum { > IORING_OP_SHUTDOWN, > IORING_OP_RENAMEAT, > IORING_OP_UNLINKAT, > + IORING_OP_URING_CMD, > > /* this goes last, obviously */ > IORING_OP_LAST, > -- > 2.30.0 >