uprobes work by raising a trap, setting a task flag from within the interrupt handler, and processing the actual work for the uprobe on the way back to userspace. As a result, uprobe handlers already execute in a user context. The primary obstacle to sleepable bpf uprobe programs is therefore on the bpf side. Namely, the bpf_prog_array attached to the uprobe is protected by normal rcu and runs with disabled preemption. In order for uprobe bpf programs to become actually sleepable, we need it to be protected by the tasks_trace rcu flavor instead (and kfree() called after a corresponding grace period). Based on Alexei's proposal, we change the free path for bpf_prog_array to chain a tasks_trace and normal grace periods one after the other. Users who iterate under tasks_trace read section would be safe, as would users who iterate under normal read sections (from non-sleepable locations). The downside is that we take the tasks_trace latency for all perf_event-attached bpf programs (and not just uprobe ones) but this is deemed safe given the possible attach rates for kprobe/uprobe/tp programs. Separately, non-sleepable programs need access to dynamically sized rcu-protected maps, so we conditionally disable preemption and take an rcu read section around them, in addition to the overarching tasks_trace section. Signed-off-by: Delyan Kratunov <delyank@xxxxxx> --- include/linux/bpf.h | 57 ++++++++++++++++++++++++++++++++++++ include/linux/trace_events.h | 1 + kernel/bpf/core.c | 15 ++++++++++ kernel/trace/bpf_trace.c | 27 +++++++++++++++-- kernel/trace/trace_uprobe.c | 4 +-- 5 files changed, 99 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 57ec619cf729..592886115011 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -26,6 +26,7 @@ #include <linux/stddef.h> #include <linux/bpfptr.h> #include <linux/btf.h> +#include <linux/rcupdate_trace.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -1343,6 +1344,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); +/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */ +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, @@ -1428,6 +1431,60 @@ bpf_prog_run_array(const struct bpf_prog_array *array, return ret; } +/** + * Notes on RCU design for bpf_prog_arrays containing sleepable programs: + * + * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array + * overall. As a result, we must use the bpf_prog_array_free_sleepable + * in order to use the tasks_trace rcu grace period. + * + * When a non-sleepable program is inside the array, we take the rcu read + * section and disable preemption for that program alone, so it can access + * rcu-protected dynamically sized maps. + */ +static __always_inline u32 +bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + u32 ret = 1; + + might_fault(); + + migrate_disable(); + rcu_read_lock_trace(); + + array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held()); + if (unlikely(!array)) + goto out; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + item = &array->items[0]; + while ((prog = READ_ONCE(item->prog))) { + if (!prog->aux->sleepable) { + preempt_disable(); + rcu_read_lock(); + } + + run_ctx.bpf_cookie = item->bpf_cookie; + ret &= run_prog(prog, ctx); + item++; + + if (!prog->aux->sleepable) { + rcu_read_unlock(); + preempt_enable(); + } + } + bpf_reset_run_ctx(old_run_ctx); +out: + rcu_read_unlock_trace(); + migrate_enable(); + return ret; +} + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index e6e95a9f07a5..d45889f1210d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -736,6 +736,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); +unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 13e9dbeeedf3..9271b708807a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2268,6 +2268,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs) kfree_rcu(progs, rcu); } +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) +{ + struct bpf_prog_array *progs; + + progs = container_of(rcu, struct bpf_prog_array, rcu); + kfree_rcu(progs, rcu); +} + +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) +{ + if (!progs || progs == &bpf_empty_prog_array.hdr) + return; + call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); +} + int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f15b826f9899..582a6171e096 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -140,6 +140,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) return ret; } +unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx) +{ + unsigned int ret; + + /* + * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock + * to all call sites, we did a bpf_prog_array_valid() there to check + * whether call->prog_array is empty or not, which is + * a heuristic to speed up execution. + * + * If bpf_prog_array_valid() fetched prog_array was + * non-NULL, we go into uprobe_call_bpf() and do the actual + * proper rcu_dereference() under RCU trace lock. + * If it turns out that prog_array is NULL then, we bail out. + * For the opposite, if the bpf_prog_array_valid() fetched pointer + * was NULL, you'll skip the prog_array with the risk of missing + * out of events when it was updated in between this and the + * rcu_dereference() which is accepted risk. + */ + ret = bpf_prog_run_array_sleepable(call->prog_array, ctx, bpf_prog_run); + return ret; +} + #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { @@ -1915,7 +1938,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); @@ -1941,7 +1964,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); } bpf_prog_put(event->prog); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..3eb48897d15b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1346,9 +1346,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (bpf_prog_array_valid(call)) { u32 ret; - preempt_disable(); - ret = trace_call_bpf(call, regs); - preempt_enable(); + ret = uprobe_call_bpf(call, regs); if (!ret) return; } -- 2.35.1