On Mon, Nov 11, 2024 at 01:50:45AM +0000, Pavel Begunkov wrote:
> Let the user register a BPF_PROG_TYPE_IOURING BPF program to a ring.
> The program will be run in the waiting loop every time something
> happens, i.e. the task was woken up by a task_work / signal / etc.
>
> Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
> ---
>  include/linux/io_uring_types.h |  4 +++
>  include/uapi/linux/io_uring.h  |  9 +++++
>  io_uring/bpf.c                 | 63 ++++++++++++++++++++++++++++++++++
>  io_uring/bpf.h                 | 41 ++++++++++++++++++++++
>  io_uring/io_uring.c            | 15 ++++++++
>  io_uring/register.c            |  7 ++++
>  6 files changed, 139 insertions(+)
>  create mode 100644 io_uring/bpf.h
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index ad5001102c86..50cee0d3622e 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -8,6 +8,8 @@
>  #include <linux/llist.h>
>  #include <uapi/linux/io_uring.h>
>  
> +struct io_bpf_ctx;
> +
>  enum {
>  	/*
>  	 * A hint to not wake right away but delay until there are enough of
> @@ -246,6 +248,8 @@ struct io_ring_ctx {
>  
>  	enum task_work_notify_mode	notify_method;
>  	unsigned			sq_thread_idle;
> +
> +	struct io_bpf_ctx		*bpf_ctx;
>  } ____cacheline_aligned_in_smp;
>  
>  	/* submission data */
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index ba373deb8406..f2c2fefc8514 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -634,6 +634,8 @@ enum io_uring_register_op {
>  	/* register fixed io_uring_reg_wait arguments */
>  	IORING_REGISTER_CQWAIT_REG		= 34,
>  
> +	IORING_REGISTER_BPF			= 35,
> +
>  	/* this goes last */
>  	IORING_REGISTER_LAST,
>  
> @@ -905,6 +907,13 @@ enum io_uring_socket_op {
>  	SOCKET_URING_OP_SETSOCKOPT,
>  };
>  
> +struct io_uring_bpf_reg {
> +	__u64	prog_fd;
> +	__u32	flags;
> +	__u32	resv1;
> +	__u64	resv2[2];
> +};
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/io_uring/bpf.c b/io_uring/bpf.c
> index 6eb0c47b4aa9..8b7c74761c63 100644
> --- a/io_uring/bpf.c
> +++ b/io_uring/bpf.c
> @@ -1,6 +1,9 @@
>  // SPDX-License-Identifier: GPL-2.0
>  
>  #include <linux/bpf.h>
> +#include <linux/filter.h>
> +
> +#include "bpf.h"
>  
>  static const struct bpf_func_proto *
>  io_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> @@ -22,3 +25,63 @@ const struct bpf_verifier_ops bpf_io_uring_verifier_ops = {
>  	.get_func_proto = io_bpf_func_proto,
>  	.is_valid_access = io_bpf_is_valid_access,
>  };
> +
> +int io_run_bpf(struct io_ring_ctx *ctx)
> +{
> +	struct io_bpf_ctx *bc = ctx->bpf_ctx;
> +	int ret;
> +
> +	mutex_lock(&ctx->uring_lock);
> +	ret = bpf_prog_run_pin_on_cpu(bc->prog, bc);
> +	mutex_unlock(&ctx->uring_lock);
> +	return ret;
> +}
> +
> +int io_unregister_bpf(struct io_ring_ctx *ctx)
> +{
> +	struct io_bpf_ctx *bc = ctx->bpf_ctx;
> +
> +	if (!bc)
> +		return -ENXIO;
> +	bpf_prog_put(bc->prog);
> +	kfree(bc);
> +	ctx->bpf_ctx = NULL;
> +	return 0;
> +}
> +
> +int io_register_bpf(struct io_ring_ctx *ctx, void __user *arg,
> +		    unsigned int nr_args)
> +{
> +	struct io_uring_bpf_reg __user *bpf_reg_usr = arg;
> +	struct io_uring_bpf_reg bpf_reg;
> +	struct io_bpf_ctx *bc;
> +	struct bpf_prog *prog;
> +
> +	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
> +		return -EOPNOTSUPP;
> +
> +	if (nr_args != 1)
> +		return -EINVAL;
> +	if (copy_from_user(&bpf_reg, bpf_reg_usr, sizeof(bpf_reg)))
> +		return -EFAULT;
> +	if (bpf_reg.flags || bpf_reg.resv1 ||
> +	    bpf_reg.resv2[0] || bpf_reg.resv2[1])
> +		return -EINVAL;
> +
> +	if (ctx->bpf_ctx)
> +		return -ENXIO;
> +
> +	bc = kzalloc(sizeof(*bc), GFP_KERNEL);
> +	if (!bc)
> +		return -ENOMEM;
> +
> +	prog = bpf_prog_get_type(bpf_reg.prog_fd, BPF_PROG_TYPE_IOURING);
> +	if (IS_ERR(prog)) {
> +		kfree(bc);
> +		return PTR_ERR(prog);
> +	}
> +
> +	bc->prog = prog;
> +	ctx->bpf_ctx = bc;
> +	return 0;
> +}
> diff --git a/io_uring/bpf.h b/io_uring/bpf.h
> new file mode 100644
> index 000000000000..2b4e555ff07a
> --- /dev/null
> +++ b/io_uring/bpf.h
> @@ -0,0 +1,41 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#ifndef IOU_BPF_H
> +#define IOU_BPF_H
> +
> +#include <linux/io_uring/bpf.h>
> +#include <linux/io_uring_types.h>
> +
> +struct bpf_prog;
> +
> +struct io_bpf_ctx {
> +	struct io_bpf_ctx_kern kern;
> +	struct bpf_prog *prog;
> +};
> +
> +static inline bool io_bpf_enabled(struct io_ring_ctx *ctx)
> +{
> +	return IS_ENABLED(CONFIG_BPF) && ctx->bpf_ctx != NULL;
> +}
> +
> +#ifdef CONFIG_BPF
> +int io_register_bpf(struct io_ring_ctx *ctx, void __user *arg,
> +		    unsigned int nr_args);
> +int io_unregister_bpf(struct io_ring_ctx *ctx);
> +int io_run_bpf(struct io_ring_ctx *ctx);
> +
> +#else
> +static inline int io_register_bpf(struct io_ring_ctx *ctx, void __user *arg,
> +				  unsigned int nr_args)
> +{
> +	return -EOPNOTSUPP;
> +}
> +static inline int io_unregister_bpf(struct io_ring_ctx *ctx)
> +{
> +	return -EOPNOTSUPP;
> +}
> +static inline int io_run_bpf(struct io_ring_ctx *ctx)
> +{
> +	return 0;
> +}
> +#endif
> +
> +#endif
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index f34fa1ead2cf..82599e2a888a 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -104,6 +104,7 @@
>  #include "rw.h"
>  #include "alloc_cache.h"
>  #include "eventfd.h"
> +#include "bpf.h"
>  
>  #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
>  			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
> @@ -2834,6 +2835,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>  
>  	io_napi_busy_loop(ctx, &iowq);
>  
> +	if (io_bpf_enabled(ctx)) {
> +		ret = io_run_bpf(ctx);
> +		if (ret == IOU_BPF_RET_STOP)
> +			return 0;
> +	}
> +
>  	trace_io_uring_cqring_wait(ctx, min_events);
>  	do {
>  		unsigned long check_cq;
> @@ -2879,6 +2886,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>  		if (ret < 0)
>  			break;
>  
> +		if (io_bpf_enabled(ctx)) {
> +			ret = io_run_bpf(ctx);
> +			if (ret == IOU_BPF_RET_STOP)
> +				break;
> +			continue;
> +		}

I believe 'struct_ops' is a much simpler way to run the prog and return the
result. Then you wouldn't need any BPF core changes or the new BPF register
code.

Thanks,
Ming
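
As a rough illustration of that suggestion only, a struct_ops-based flow
could look something like the two fragments below. None of this is from the
posted series: the io_uring_bpf_ops table, its ->wait() callback, the
ctx->bpf_ops pointer and the io_bpf_run() helper are made-up names, and the
usual bpf_struct_ops registration boilerplate (verifier ops, init/reg/unreg
callbacks, register_bpf_struct_ops()) is left out.

/* Kernel side (sketch): the ops table the cqring wait loop would call. */
struct io_uring_bpf_ops {
	/* called after each wakeup; returning IOU_BPF_RET_STOP ends the wait */
	int (*wait)(struct io_ring_ctx *ctx);
};

static int io_bpf_run(struct io_ring_ctx *ctx)
{
	/* ctx->bpf_ops is a hypothetical field set by the struct_ops reg() hook */
	struct io_uring_bpf_ops *ops = READ_ONCE(ctx->bpf_ops);

	return ops ? ops->wait(ctx) : 0;
}

/* BPF program side (sketch): attached through libbpf's struct_ops support. */
SEC("struct_ops/wait")
int BPF_PROG(io_wait, struct io_ring_ctx *ctx)
{
	return 0;	/* keep waiting */
}

SEC(".struct_ops.link")
struct io_uring_bpf_ops wait_ops = {
	.wait = (void *)io_wait,
};

Attach/detach would then go through the generic struct_ops path (e.g.
bpf_map__attach_struct_ops() and dropping the link), so there would be no
need for the IORING_REGISTER_BPF opcode or the copy_from_user() parsing in
io_register_bpf().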