Add a new helper function that can schedule a callback to execute in a
different context. Initially, only irq_work (i.e. hardirq) is supported.

A key consideration is that we need this to work from NMI context.
Therefore, we use a queue of pre-allocated llist nodes embedded in
bpf_delayed_work, which we drain on a per-program basis.

To avoid races on the bpf_delayed_work items, we implement a simple
locking scheme based on cmpxchg ordering.

Signed-off-by: Delyan Kratunov <delyank@xxxxxx>
---
 include/linux/bpf.h            |  13 ++++
 include/uapi/linux/bpf.h       |  28 ++++++++
 kernel/bpf/core.c              |   8 +++
 kernel/bpf/helpers.c           |  92 ++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 123 ++++++++++++++++++++++++++++++++-
 scripts/bpf_doc.py             |   2 +
 tools/include/uapi/linux/bpf.h |  27 ++++++++
 7 files changed, 292 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ad9d2cfb0411..7325a9a2d10b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -27,6 +27,8 @@
 #include <linux/bpfptr.h>
 #include <linux/btf.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -460,6 +462,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
 	ARG_PTR_TO_KPTR,	/* pointer to referenced kptr */
 	ARG_PTR_TO_DYNPTR,	/* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
+	ARG_PTR_TO_DELAYED_WORK,/* pointer to bpf_delayed_work */
 	__BPF_ARG_TYPE_MAX,
 
 	/* Extended arg_types. */
@@ -1101,6 +1104,9 @@ struct bpf_prog_aux {
 	u32 linfo_idx;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
+
+	/* initialized at load time if program uses delayed work helpers */
+	struct bpf_delayed_irq_work *irq_work;
 	union {
 		struct work_struct work;
 		struct rcu_head rcu;
@@ -2526,4 +2532,11 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 int bpf_dynptr_check_size(u32 size);
 
+struct bpf_delayed_irq_work {
+	struct llist_head items;
+	struct irq_work work;
+	struct bpf_prog *prog;
+};
+void bpf_delayed_work_irq_work_cb(struct irq_work *work);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d68fc4f472f1..dc0587bbbe7c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5325,6 +5325,29 @@ union bpf_attr {
 *		**-EACCES** if the SYN cookie is not valid.
 *
 *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ *
+ * long bpf_delayed_work_submit(struct bpf_delayed_work *work, void *cb, void *data, int flags)
+ *	Description
+ *		Submits a function to execute in a different context.
+ *
+ *		*work* must be a member of a map value.
+ *
+ *		*cb* is the function to call.
+ *
+ *		*data* is the context to pass as the sole argument to *cb*. It must
+ *		be part of a map value or NULL.
+ *
+ *		*flags* must be **BPF_DELAYED_WORK_IRQWORK**.
+ *	Return
+ *		0 when work is successfully submitted.
+ *
+ *		**-EINVAL** if *cb* is NULL
+ *
+ *		**-EOPNOTSUPP** if *flags* is not **BPF_DELAYED_WORK_IRQWORK**, or if
+ *		called from an NMI handler on an architecture without NMI-safe cmpxchg
+ *
+ *		**-EINVAL** if *work* is already in use
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5535,6 +5558,7 @@ union bpf_attr {
 	FN(tcp_raw_gen_syncookie_ipv6),	\
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
+	FN(delayed_work_submit),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6699,6 +6723,10 @@ struct bpf_delayed_work {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+enum {
+	BPF_DELAYED_WORK_IRQWORK = (1UL << 0),
+};
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b5ffebcce6cc..1f5093f9442b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2567,6 +2567,14 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	int i;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
+
+	/* We have already waited for a grace period of the appropriate RCU
+	 * variety, so we can expect no further submissions of work. Just wait
+	 * for the currently scheduled work to finish before releasing anything.
+	 */
+	if (aux->irq_work)
+		irq_work_sync(&aux->irq_work->work);
+
 #ifdef CONFIG_BPF_SYSCALL
 	bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
 #endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a1c84d256f83..731547d34c35 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,8 @@
 #include <linux/proc_ns.h>
 #include <linux/security.h>
 #include <linux/btf_ids.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -1575,6 +1577,94 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
 	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
 };
 
+struct bpf_delayed_work_kern {
+	struct llist_node item;
+	u64 flags; /* used as a lock field */
+	void (*cb)(void *);
+	void *data;
+} __aligned(8);
+
+#define BPF_DELAYED_WORK_FREE		(0)
+#define BPF_DELAYED_WORK_CLAIMED	(1)
+#define BPF_DELAYED_WORK_READY		(2)
+
+void bpf_delayed_work_irq_work_cb(struct irq_work *work)
+{
+	struct bpf_delayed_irq_work *bpf_irq_work = container_of(work, struct bpf_delayed_irq_work, work);
+	struct bpf_delayed_work_kern *work_item, *next;
+	struct llist_node *work_list = llist_del_all(&bpf_irq_work->items);
+
+	/* Traverse in submission order to preserve ordering semantics */
+	work_list = llist_reverse_order(work_list);
+
+	llist_for_each_entry_safe(work_item, next, work_list, item) {
+		WARN_ONCE(work_item->flags != BPF_DELAYED_WORK_READY, "incomplete bpf_delayed_work found");
+
+		work_item->cb(work_item->data);
+
+		work_item->cb = work_item->data = NULL;
+		bpf_prog_put(bpf_irq_work->prog);
+		xchg(&work_item->flags, BPF_DELAYED_WORK_FREE);
+	}
+}
+
+BPF_CALL_5(bpf_delayed_work_submit, struct bpf_delayed_work_kern *, work,
+	   void *, callback_fn, void *, data, int, flags, struct bpf_prog_aux *, aux)
+{
+	u64 ret;
+	struct bpf_prog *prog;
+
+	BUILD_BUG_ON(sizeof(struct bpf_delayed_work_kern) > sizeof(struct bpf_delayed_work));
+	BUILD_BUG_ON(__alignof__(struct bpf_delayed_work_kern) != __alignof__(struct bpf_delayed_work));
+	BTF_TYPE_EMIT(struct bpf_delayed_work);
+
+	if (callback_fn == NULL)
+		return -EINVAL;
+
+	if (flags != BPF_DELAYED_WORK_IRQWORK)
+		return -EOPNOTSUPP;
+
+	if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && in_nmi())
+		return -EOPNOTSUPP;
+
+	ret = cmpxchg(&work->flags, BPF_DELAYED_WORK_FREE, BPF_DELAYED_WORK_CLAIMED);
+	if (ret != BPF_DELAYED_WORK_FREE)
+		return -EINVAL;
+
+	work->data = data;
+	work->cb = callback_fn;
+
+	ret = cmpxchg(&work->flags, BPF_DELAYED_WORK_CLAIMED, BPF_DELAYED_WORK_READY);
+	if (ret != BPF_DELAYED_WORK_CLAIMED) {
+		WARN_ONCE(1, "bpf_delayed_work item altered while claimed");
+		return -EINVAL;
+	}
+
+	/* Bump the ref count for every work item submitted by the program. */
+	prog = bpf_prog_inc_not_zero(aux->prog);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	llist_add(&work->item, &aux->irq_work->items);
+
+	/* It's okay if this prog's irq_work is already submitted,
+	 * it will walk the same list of callbacks anyway.
+	 */
+	(void) irq_work_queue(&aux->irq_work->work);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_delayed_work_submit_proto = {
+	.func		= bpf_delayed_work_submit,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_DELAYED_WORK,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_PTR_TO_MAP_VALUE, /* TODO: need ptr_to_map_value_mem */
+	.arg4_type	= ARG_ANYTHING,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1643,6 +1733,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_dynptr_write_proto;
 	case BPF_FUNC_dynptr_data:
 		return &bpf_dynptr_data_proto;
+	case BPF_FUNC_delayed_work_submit:
+		return &bpf_delayed_work_submit_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9fd311b7a1ff..212cbea5a382 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5490,6 +5490,55 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int process_delayed_work_func(struct bpf_verifier_env *env, int regno,
+				     struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have a constant offset. bpf_delayed_work has to be at a constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env, "map '%s' has to have BTF in order to use bpf_delayed_work\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (!map_value_has_delayed_work(map)) {
+		if (map->delayed_work_off == -E2BIG)
+			verbose(env,
+				"map '%s' has more than one 'struct bpf_delayed_work'\n",
+				map->name);
+		else if (map->delayed_work_off == -ENOENT)
+			verbose(env,
+				"map '%s' doesn't have 'struct bpf_delayed_work'\n",
+				map->name);
+		else
+			verbose(env,
+				"map '%s' is not a struct type or bpf_delayed_work is mangled\n",
+				map->name);
+		return -EINVAL;
+	}
+	if (map->delayed_work_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_delayed_work' that is at %d\n",
+			val + reg->off, map->delayed_work_off);
+		return -EINVAL;
+	}
+	if (meta->map_ptr) {
+		verbose(env, "verifier bug. Two map pointers in a delayed work helper\n");
+		return -EFAULT;
+	}
+
+	meta->map_uid = reg->map_uid;
+	meta->map_ptr = map;
+	return 0;
+}
+
 static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 			     struct bpf_call_arg_meta *meta)
 {
@@ -5677,6 +5726,7 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types delayed_work_types = { .types = { PTR_TO_MAP_VALUE } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -5704,6 +5754,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_TIMER]		= &timer_types,
 	[ARG_PTR_TO_KPTR]		= &kptr_types,
 	[ARG_PTR_TO_DYNPTR]		= &stack_ptr_types,
+	[ARG_PTR_TO_DELAYED_WORK]	= &delayed_work_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -6018,6 +6069,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	} else if (arg_type == ARG_PTR_TO_TIMER) {
 		if (process_timer_func(env, regno, meta))
 			return -EACCES;
+	} else if (arg_type == ARG_PTR_TO_DELAYED_WORK) {
+		if (process_delayed_work_func(env, regno, meta))
+			return -EACCES;
 	} else if (arg_type == ARG_PTR_TO_FUNC) {
 		meta->subprogno = reg->subprogno;
 	} else if (base_type(arg_type) == ARG_PTR_TO_MEM) {
@@ -6670,7 +6724,8 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	if (insn->code == (BPF_JMP | BPF_CALL) &&
 	    insn->src_reg == 0 &&
-	    insn->imm == BPF_FUNC_timer_set_callback) {
+	    (insn->imm == BPF_FUNC_timer_set_callback ||
+	     insn->imm == BPF_FUNC_delayed_work_submit)) {
 		struct bpf_verifier_state *async_cb;
 
 		/* there is no real recursion here. timer callbacks are async */
@@ -6898,6 +6953,30 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_delayed_work_callback_state(struct bpf_verifier_env *env,
+					   struct bpf_func_state *caller,
+					   struct bpf_func_state *callee,
+					   int insn_idx)
+{
+	/* bpf_delayed_work_submit(struct bpf_delayed_work *work,
+	 *			   void *callback_fn, void *data, int flags);
+	 *
+	 * callback_fn(void *callback_ctx);
+	 */
+	callee->regs[BPF_REG_1].type = PTR_TO_MAP_VALUE;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+	callee->regs[BPF_REG_1].map_ptr = caller->regs[BPF_REG_3].map_ptr;
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_2]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -7294,6 +7373,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			reg_type_str(env, regs[BPF_REG_1].type));
 			return -EACCES;
 		}
+		break;
+	case BPF_FUNC_delayed_work_submit:
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_delayed_work_callback_state);
+		break;
 	}
 
 	if (err)
@@ -7468,6 +7552,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
 		env->prog->call_get_stack = true;
 
+	if (func_id == BPF_FUNC_delayed_work_submit) {
+		struct bpf_delayed_irq_work *irq_work = kmalloc(
+			sizeof(struct bpf_delayed_irq_work), GFP_KERNEL);
+		if (!irq_work) {
+			verbose(env, "could not allocate irq_work\n");
+			return -ENOMEM;
+		}
+
+		init_llist_head(&irq_work->items);
+		irq_work->work = IRQ_WORK_INIT_HARD(&bpf_delayed_work_irq_work_cb);
+		irq_work->prog = env->prog;
+		env->prog->aux->irq_work = irq_work;
+	}
+
+
 	if (func_id == BPF_FUNC_get_func_ip) {
 		if (check_get_func_ip(env))
 			return -ENOTSUPP;
@@ -14061,6 +14160,28 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		if (insn->imm == BPF_FUNC_delayed_work_submit) {
+			/* Append prog->aux as the hidden 5th argument. */
+			struct bpf_insn ld_addrs[2] = {
+				BPF_LD_IMM64(BPF_REG_5, (long)prog->aux),
+			};
+
+			insn_buf[0] = ld_addrs[0];
+			insn_buf[1] = ld_addrs[1];
+			insn_buf[2] = *insn;
+			cnt = 3;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
+
 		if (insn->imm == BPF_FUNC_task_storage_get ||
 		    insn->imm == BPF_FUNC_sk_storage_get ||
 		    insn->imm == BPF_FUNC_inode_storage_get) {
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index a0ec321469bd..0dd43dc9f388 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -637,6 +637,7 @@ class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct bpf_delayed_work',
     ]
     known_types = {
             '...',
@@ -690,6 +691,7 @@ class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct bpf_delayed_work',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d68fc4f472f1..461417159106 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5325,6 +5325,28 @@ union bpf_attr {
 *		**-EACCES** if the SYN cookie is not valid.
 *
 *		**-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_delayed_work_submit(struct bpf_delayed_work *work, void *cb, void *data, int flags)
+ *	Description
+ *		Submits a function to execute in a different context.
+ *
+ *		*work* must be a member of a map value.
+ *
+ *		*cb* is the function to call.
+ *
+ *		*data* is the context to pass as the sole argument to *cb*. It must
+ *		be part of a map value or NULL.
+ *
+ *		*flags* must be **BPF_DELAYED_WORK_IRQWORK**.
+ *	Return
+ *		0 when work is successfully submitted.
+ *
+ *		**-EINVAL** if *cb* is NULL
+ *
+ *		**-EOPNOTSUPP** if *flags* is not **BPF_DELAYED_WORK_IRQWORK**, or if
+ *		called from an NMI handler on an architecture without NMI-safe cmpxchg
+ *
+ *		**-EINVAL** if *work* is already in use
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5535,6 +5557,7 @@ union bpf_attr {
 	FN(tcp_raw_gen_syncookie_ipv6),	\
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
+	FN(delayed_work_submit),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6699,6 +6722,10 @@ struct bpf_delayed_work {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+enum {
+	BPF_DELAYED_WORK_IRQWORK = (1UL << 0),
+};
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
-- 
2.36.1
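
For reviewers: below is a minimal, untested usage sketch (not part of the
patch). The map layout, names, and program section are hypothetical, and
the bpf_delayed_work_submit() declaration is assumed to come from a
bpf_helper_defs.h regenerated with the bpf_doc.py change above. It only
illustrates the flow the helper doc describes: embed struct
bpf_delayed_work in a map value, then pass a callback plus a pointer into
the same map value to bpf_delayed_work_submit().

/* SPDX-License-Identifier: GPL-2.0 */
/* Illustrative sketch only -- not part of this patch. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct elem {
	struct bpf_delayed_work work;	/* at most one per map value */
	__u64 payload;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} work_map SEC(".maps");

/* Invoked later from hardirq context via the program's irq_work; its
 * argument is the *data* pointer given to bpf_delayed_work_submit().
 */
static int work_cb(void *data)
{
	__u64 *payload = data;

	(void)*payload;	/* do hardirq-safe processing here */
	return 0;
}

SEC("perf_event")
int handle_sample(void *ctx)
{
	int key = 0;
	struct elem *e;

	e = bpf_map_lookup_elem(&work_map, &key);
	if (!e)
		return 0;

	e->payload = 42;

	/* Claims e->work (FREE -> CLAIMED -> READY) and queues it on this
	 * program's irq_work list; returns -EINVAL while the item is still
	 * in use, i.e. until the callback has run and released it.
	 */
	return bpf_delayed_work_submit(&e->work, work_cb, &e->payload,
				       BPF_DELAYED_WORK_IRQWORK);
}

char LICENSE[] SEC("license") = "GPL";	/* the helper is gpl_only */

A perf_event program is used here because it can fire in NMI context,
which is the case the cmpxchg-based claim scheme is designed to survive.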