On 11/15/21 9:42 PM, Kumar Kartikeya Dwivedi wrote:
This change adds an eBPF iterator for files registered in an io_uring ctx.
It gives access to the ctx, the index of the registered file, and a
pointer to the struct file itself. This allows the iterator to save
info related to the file added to an io_uring instance, that isn't easy
to export using the fdinfo interface (like being able to match
registered files to a task's file set). Getting access to underlying
struct file allows deduplication and efficient pairing with task file
set (obtained using task_file iterator).
The primary use case this enables is checkpoint/restore support.
Note that the seq_start function must use mutex_trylock when the iterator
file is read, because the lock order is the opposite of the one used when
an io_uring operation reads the same file: we take seq_file->lock and then
ctx->uring_lock, while io_uring first takes ctx->uring_lock and then
seq_file->lock for the same ctx.
This can lead to a deadlock scenario described below:
CPU 0                                   CPU 1
vfs_read
 mutex_lock(&seq_file->lock)            io_read
                                         mutex_lock(&ctx->uring_lock)
 mutex_lock(&ctx->uring_lock) # switched to mutex_trylock
                                         mutex_lock(&seq_file->lock)
The trylock also protects the case where io_uring tries to read from
iterator attached to itself (same ctx), where the order of locks would
be:
io_uring_enter
 mutex_lock(&ctx->uring_lock) <-----------.
 io_read                                   \
  seq_read                                  \
   mutex_lock(&seq_file->lock)              /
   mutex_lock(&ctx->uring_lock) # deadlock-`
In both these cases (recursive read and contended uring_lock), -EDEADLK
is returned to userspace.
With the advent of descriptorless files supported by io_uring, this
iterator provides the required visibility and introspection of io_uring
instance for the purposes of dumping and restoring it.
In the future, this iterator will be extended to support direct
inspection of a lot of file state (currently descriptorless files
are obtained using openat2 and socket) to dump file state for these
hidden files. Later, we can explore filling in the gaps for dumping
file state for more file types (those not hidden in io_uring ctx).
All of this is out of scope for the current series, but it builds
upon this iterator.
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Pavel Begunkov <asml.silence@xxxxxxxxx>
Cc: io-uring@xxxxxxxxxxxxxxx
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
---
fs/io_uring.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 139 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9e9df6767e29..7ac479c95d4e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -11132,6 +11132,7 @@ __initcall(io_uring_init);
BTF_ID_LIST(btf_io_uring_ids)
BTF_ID(struct, io_ring_ctx)
BTF_ID(struct, io_mapped_ubuf)
+BTF_ID(struct, file)
struct bpf_io_uring_seq_info {
struct io_ring_ctx *ctx;
@@ -11312,11 +11313,148 @@ const struct bpf_func_proto bpf_page_to_pfn_proto = {
.arg1_btf_id = &btf_page_to_pfn_ids[0],
};
+/* io_uring iterator for registered files */
+
+struct bpf_iter__io_uring_file {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct io_ring_ctx *, ctx);
+ __bpf_md_ptr(struct file *, file);
+ unsigned long index;
change "unsigned long" to either u32 or u64, maybe just u64?
+};
+/*
+ * Fetch the entry at info->index with io_file_from_index() and advance
+ * info->index past it. Nothing is fetched, and the index is left alone,
+ * once info->index has reached info->ctx->nr_user_files.
+ *
+ * Returns the struct file for a populated slot, the info pointer itself
+ * as a sentinel for an empty slot, or NULL when no slots remain.
+ */
+static void *__bpf_io_uring_file_seq_get_next(struct bpf_io_uring_seq_info *info)
+{
+ struct file *file = NULL;
+
+ if (info->index < info->ctx->nr_user_files) {
+ /* file set can be sparse */
+ file = io_file_from_index(info->ctx, info->index++);
+ /* use info as a distinct pointer to distinguish between empty
+ * slot and valid file, since we cannot return NULL for this
+ * case if we want iter prog to still be invoked with file ==
+ * NULL.
+ */
+ if (!file)
+ return info;
+ }
+
+ return file;
+}
+/*
+ * seq_file start callback. Tries to take info->ctx->uring_lock without
+ * blocking and returns ERR_PTR(-EDEADLK) if the trylock fails. Otherwise
+ * returns the next entry from __bpf_io_uring_file_seq_get_next(), or NULL
+ * when there is none; *pos is bumped from 0 to 1 when an entry is returned.
+ *
+ * NOTE(review): uring_lock stays held on every return path after a
+ * successful trylock; the matching unlock is not visible in this chunk
+ * (presumably the stop callback) - confirm it is not also run after the
+ * -EDEADLK path, where the lock was never taken.
+ */
+static void *bpf_io_uring_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_io_uring_seq_info *info = seq->private;
+ struct file *file;
+
+ /* Indicate to userspace that the uring lock is contended */
+ if (!mutex_trylock(&info->ctx->uring_lock))
+ return ERR_PTR(-EDEADLK);
+
+ file = __bpf_io_uring_file_seq_get_next(info);
+ if (!file)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return file;
+}
+/*
+ * seq_file next callback. Unconditionally increments *pos and returns the
+ * next entry from __bpf_io_uring_file_seq_get_next(): a struct file, the
+ * info pointer for an empty slot, or NULL when no slots remain. The
+ * previous element v is not used, and no lock is taken here.
+ */
+static void *bpf_io_uring_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_io_uring_seq_info *info = seq->private;
+
+ ++*pos;
+ return __bpf_io_uring_file_seq_get_next(info);
+}
+/*
+ * Instantiate DEFINE_BPF_ITER_FUNC (defined outside this chunk) for the
+ * io_uring_file target. The parameter list matches, in order, the fields
+ * of struct bpf_iter__io_uring_file: meta, ctx, file, index.
+ */
+DEFINE_BPF_ITER_FUNC(io_uring_file, struct bpf_iter_meta *meta,
+ struct io_ring_ctx *ctx, struct file *file,
+ unsigned long index)
unsigned long => u64?
+/*
+ * Run the BPF iterator program obtained from bpf_iter_get_info() on one
+ * element and return the result of bpf_iter_run_prog(); returns 0 without
+ * running anything when no program is returned.
+ *
+ * v is the element handed out by start/next: a struct file, or the info
+ * pointer for an empty slot, in which case the program sees file == NULL.
+ *
+ * __bpf_io_uring_file_seq_get_next() post-increments info->index, so the
+ * index passed to the program is info->index - 1 when in_stop is false and
+ * info->index itself when in_stop is true; it is 0 while info->index is 0.
+ */
+static int __bpf_io_uring_file_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_io_uring_seq_info *info = seq->private;
+ struct bpf_iter__io_uring_file ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.ctx = info->ctx;
+ /* when we encounter empty slot, v will point to info */
+ ctx.file = v == info ? NULL : v;
+ ctx.index = info->index ? info->index - !in_stop : 0;
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+/*
+ * seq_file show callback: forwards seq and v to
+ * __bpf_io_uring_file_seq_show() with in_stop set to false and returns
+ * its result.
+ */
+static int bpf_io_uring_file_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_io_uring_file_seq_show(seq, v, false);
+}
+
[...]