On Tue, Jul 14 2020 at 14:08:26 -0700, Josh Triplett quoth thus: > Add a new operation IORING_OP_OPENAT2_FIXED_FILE, which opens a file > into the fixed-file table rather than installing a file descriptor. > Using a new operation avoids having an IOSQE flag that almost all > operations will need to ignore; io_openat2_fixed_file also has > substantially different control-flow than io_openat2, and it can avoid > requiring the file table if not needed for the dirfd. > > (This intentionally does not use the IOSQE_FIXED_FILE flag, because > semantically, IOSQE_FIXED_FILE for openat2 should mean to interpret the > dirfd as a fixed-file-table index, and that would be useful future > behavior for both IORING_OP_OPENAT2 and IORING_OP_OPENAT2_FIXED_FILE.) > > Create a new io_sqe_files_add_new function to add a single new file to > the fixed-file table. This function returns -EBUSY if attempting to > overwrite an existing file. > > Provide a new field to pass along the fixed-file-table index for an > open-like operation; future operations such as > IORING_OP_ACCEPT_FIXED_FILE can use the same index. > > Signed-off-by: Josh Triplett <josh@xxxxxxxxxxxxxxxx> > --- > > (Should this check for and reject open flags like O_CLOEXEC that only > affect the file descriptor?) > > I've tested this (and I'll send my liburing patch momentarily), and it > works fine if you do the open in one batch and operate on the fixed-file > in another batch. As discussed via Twitter, opening and operating on a > file in the same batch will require changing other operations to obtain > their fixed-file entries later, post-prep. > > It might make sense to do and test that for one operation at a time, and > add a .late_fixed_file flag to the operation definition for operations > that support that. > > It might also make sense to have the prep for > IORING_OP_OPENAT2_FIXED_FILE stick an indication in the fixed-file table > that there *will* be a file there later, perhaps an > ERR_PTR(-EINPROGRESS), and make sure there isn't one already, to detect > potential errors earlier and to let the prep for other operations > confirm that there *will* be a file; on the other hand, that would mean > there's an invalid non-NULL file pointer in the fixed file table, which > seems potentially error-prone if any operation ever forgets that. > > The other next step would be to add an IORING_OP_CLOSE_FIXED_FILE > (separate from the existing CLOSE op) that removes an entry currently in > the fixed file table and calls fput on it. (With some care, that > *should* be possible even for an entry that was originally registered > from a file descriptor.) > > And finally, we should have an IORING_OP_FIXED_FILE_TO_FD operation, > which calls get_unused_fd_flags (with specified flags to allow for > O_CLOEXEC) and then fd_install. That allows opening a file via io_uring, > operating on it via the ring, but then also operating on it via other > syscalls (or inheriting it or anything else you can do with a file > descriptor). I'd been thinking about fixed file operations previous to your post, so I'm happy to see this work. I see IORING_OP_FIXED_FILE_TO_FD as a dup() function from fixed file to process descriptor space. It would be nice if it would take parameters to select the functionality of dup, dup2, dup3, F_DUPFD, and F_DUPFD_CLOEXEC. As I recall, O_CLOFORK is on its way from Posix-land, so I'd think there will also be something like F_DUPFD_CLOFORK coming. > It would be useful if IORING_REGISTER_xxx_UPDATE would accept a placeholder value to ask the kernel not to mess with that index. I think AT_FDCWD would be a good choice. > fs/io_uring.c | 90 ++++++++++++++++++++++++++++++++++- > include/uapi/linux/io_uring.h | 6 ++- > 2 files changed, 94 insertions(+), 2 deletions(-) > > diff --git a/fs/io_uring.c b/fs/io_uring.c > index 9fd7e69696c3..df6f017ef8e8 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -425,6 +425,7 @@ struct io_sr_msg { > struct io_open { > struct file *file; > int dfd; > + u32 open_fixed_idx; > struct filename *filename; > struct open_how how; > unsigned long nofile; > @@ -878,6 +879,10 @@ static const struct io_op_def io_op_defs[] = { > .hash_reg_file = 1, > .unbound_nonreg_file = 1, > }, > + [IORING_OP_OPENAT2_FIXED_FILE] = { > + .file_table = 1, > + .needs_fs = 1, > + }, > }; > > static void io_wq_submit_work(struct io_wq_work **workptr); > @@ -886,6 +891,9 @@ static void io_put_req(struct io_kiocb *req); > static void __io_double_put_req(struct io_kiocb *req); > static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); > static void io_queue_linked_timeout(struct io_kiocb *req); > +static int io_sqe_files_add_new(struct io_ring_ctx *ctx, > + u32 index, > + struct file *file); > static int __io_sqe_files_update(struct io_ring_ctx *ctx, > struct io_uring_files_update *ip, > unsigned nr_args); > @@ -3060,10 +3068,48 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > len); > if (ret) > return ret; > + req->open.open_fixed_idx = READ_ONCE(sqe->open_fixed_idx); > > return __io_openat_prep(req, sqe); > } > > +static int io_openat2_fixed_file(struct io_kiocb *req, bool force_nonblock) > +{ > + struct io_open *open = &req->open; > + struct open_flags op; > + struct file *file; > + int ret; > + > + if (force_nonblock) { > + /* only need file table for an actual valid fd */ > + if (open->dfd == -1 || open->dfd == AT_FDCWD) > + req->flags |= REQ_F_NO_FILE_TABLE; > + return -EAGAIN; > + } > + > + ret = build_open_flags(&open->how, &op); > + if (ret) > + goto err; > + > + file = do_filp_open(open->dfd, open->filename, &op); > + if (IS_ERR(file)) { > + ret = PTR_ERR(file); > + } else { > + fsnotify_open(file); > + ret = io_sqe_files_add_new(req->ctx, open->open_fixed_idx, file); > + if (ret) > + fput(file); > + } > +err: > + putname(open->filename); > + req->flags &= ~REQ_F_NEED_CLEANUP; > + if (ret < 0) > + req_set_fail_links(req); > + io_cqring_add_event(req, ret); > + io_put_req(req); > + return 0; > +} > + > static int io_openat2(struct io_kiocb *req, bool force_nonblock) > { > struct open_flags op; > @@ -5048,6 +5094,7 @@ static int io_req_defer_prep(struct io_kiocb *req, > ret = io_madvise_prep(req, sqe); > break; > case IORING_OP_OPENAT2: > + case IORING_OP_OPENAT2_FIXED_FILE: > ret = io_openat2_prep(req, sqe); > break; > case IORING_OP_EPOLL_CTL: > @@ -5135,6 +5182,7 @@ static void io_cleanup_req(struct io_kiocb *req) > break; > case IORING_OP_OPENAT: > case IORING_OP_OPENAT2: > + case IORING_OP_OPENAT2_FIXED_FILE: > break; > case IORING_OP_SPLICE: > case IORING_OP_TEE: > @@ -5329,12 +5377,17 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, > ret = io_madvise(req, force_nonblock); > break; > case IORING_OP_OPENAT2: > + case IORING_OP_OPENAT2_FIXED_FILE: > if (sqe) { > ret = io_openat2_prep(req, sqe); > if (ret) > break; > } > - ret = io_openat2(req, force_nonblock); > + if (req->opcode == IORING_OP_OPENAT2) { > + ret = io_openat2(req, force_nonblock); > + } else { > + ret = io_openat2_fixed_file(req, force_nonblock); > + } > break; > case IORING_OP_EPOLL_CTL: > if (sqe) { > @@ -6791,6 +6844,41 @@ static int io_queue_file_removal(struct fixed_file_data *data, > return 0; > } > > +/* > + * Add a single new file in an empty entry of the fixed file table. Does not > + * allow overwriting an existing entry; returns -EBUSY in that case. > + */ > +static int io_sqe_files_add_new(struct io_ring_ctx *ctx, > + u32 index, > + struct file *file) > +{ > + struct fixed_file_table *table; > + u32 i; > + int err; > + > + if (unlikely(index > ctx->nr_user_files)) > + return -EINVAL; > + i = array_index_nospec(index, ctx->nr_user_files); > + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; > + index = i & IORING_FILE_TABLE_MASK; > + if (unlikely(table->files[index])) > + return -EBUSY; > + /* > + * Don't allow io_uring instances to be registered. If UNIX isn't > + * enabled, then this causes a reference cycle and this instance can > + * never get freed. If UNIX is enabled we'll handle it just fine, but > + * there's still no point in allowing a ring fd as it doesn't support > + * regular read/write anyway. > + */ > + if (unlikely(file->f_op == &io_uring_fops)) > + return -EBADF; > + err = io_sqe_file_register(ctx, file, i); > + if (err) > + return err; > + table->files[index] = file; > + return 0; > +} > + > static int __io_sqe_files_update(struct io_ring_ctx *ctx, > struct io_uring_files_update *up, > unsigned nr_args) > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index 7843742b8b74..95f107e6f65e 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -54,7 +54,10 @@ struct io_uring_sqe { > } __attribute__((packed)); > /* personality to use, if used */ > __u16 personality; > - __s32 splice_fd_in; > + union { > + __s32 splice_fd_in; > + __s32 open_fixed_idx; > + }; > }; > __u64 __pad2[3]; > }; > @@ -130,6 +133,7 @@ enum { > IORING_OP_PROVIDE_BUFFERS, > IORING_OP_REMOVE_BUFFERS, > IORING_OP_TEE, > + IORING_OP_OPENAT2_FIXED_FILE, > > /* this goes last, obviously */ > IORING_OP_LAST, > -- > 2.28.0.rc0