On 1/16/20 3:35 PM, Stefan Metzmacher wrote: > Am 16.01.20 um 18:23 schrieb Jens Axboe: >> The application currently has no way of knowing if a given opcode is >> supported or not without having to try and issue one and see if we get >> -EINVAL or not. And even this approach is fraught with peril, as maybe >> we're getting -EINVAL due to some fields being missing, or maybe it's >> just not that easy to issue that particular command without doing some >> other leg work in terms of setup first. >> >> This adds IORING_REGISTER_PROBE, which fills in a structure with info >> on what is supported or not. This will work even with sparse opcode >> fields, which may happen in the future or even today if someone >> backports specific features to older kernels. > > That's funny, I was just thinking about exactly that topic before > I opened the io-uring mail folder:-) > > That will make it much easier to write a portable > vfs backend for samba that doesn't depend on the kernel > features at build time. > >> + p->last_op = IORING_OP_LAST - 1; >> + /* stock kernel isn't sparse, so everything is supported */ >> + for (i = 0; i < nr_args; i++) { >> + p->ops[i].op = i; > > Shouldn't there be an if (i <= p->last_op) before we pretend to support > an opcode? Or we need to truncate nr_args. Yeah, I made some edits, just didn't post v2 yet. 
Below is the current one: diff --git a/fs/io_uring.c b/fs/io_uring.c index ee14a0fcd59f..b20587bda5d4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -561,6 +561,8 @@ struct io_op_def { unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ unsigned unbound_nonreg_file : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; }; static const struct io_op_def io_op_defs[] = { @@ -6554,6 +6556,45 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_op_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -6570,7 +6611,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return -ENXIO; if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE) { + opcode != IORING_REGISTER_FILES_UPDATE && + opcode != IORING_REGISTER_PROBE) { percpu_ref_kill(&ctx->refs); /* @@ -6632,6 +6674,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_eventfd_unregister(ctx); break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = 
io_probe(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; @@ -6639,7 +6687,8 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (opcode != IORING_UNREGISTER_FILES && - opcode != IORING_REGISTER_FILES_UPDATE) { + opcode != IORING_REGISTER_FILES_UPDATE && + opcode != IORING_REGISTER_PROBE) { /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index fea7da182851..955fd477e530 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -194,6 +194,7 @@ struct io_uring_params { #define IORING_UNREGISTER_EVENTFD 5 #define IORING_REGISTER_FILES_UPDATE 6 #define IORING_REGISTER_EVENTFD_ASYNC 7 +#define IORING_REGISTER_PROBE 8 struct io_uring_files_update { __u32 offset; @@ -201,4 +202,21 @@ struct io_uring_files_update { __aligned_u64 /* __s32 * */ fds; }; +#define IO_URING_OP_SUPPORTED (1U << 0) + +struct io_uring_probe_op { + __u8 op; + __u8 resv; + __u16 flags; /* IO_URING_OP_* flags */ + __u32 resv2; +}; + +struct io_uring_probe { + __u8 last_op; /* last opcode supported */ + __u8 ops_len; /* length of ops[] array below */ + __u16 resv; + __u32 resv2[3]; + struct io_uring_probe_op ops[0]; +}; + #endif -- Jens Axboe