On Mon, May 02, 2022 at 11:09:37PM +0000, Delyan Kratunov wrote:
> uprobes work by raising a trap, setting a task flag from within the
> interrupt handler, and processing the actual work for the uprobe on the
> way back to userspace. As a result, uprobe handlers already execute in a
> user context. The primary obstacle to sleepable bpf uprobe programs is
> therefore on the bpf side.
>
> Namely, the bpf_prog_array attached to the uprobe is protected by normal
> rcu and runs with disabled preemption.

Disabled preemption is an artifact of the past. It became unnecessary
when migrate_disable() was introduced. Looks like we forgot to update
this spot.

> In order for uprobe bpf programs
> to become actually sleepable, we need it to be protected by the tasks_trace
> rcu flavor instead (and kfree() called after a corresponding grace period).
>
> Based on Alexei's proposal, we change the free path for bpf_prog_array to
> chain a tasks_trace and normal grace periods one after the other.
>
> Users who iterate under tasks_trace read section would
> be safe, as would users who iterate under normal read sections (from
> non-sleepable locations). The downside is that we take the tasks_trace latency
> for all perf_event-attached bpf programs (and not just uprobe ones)
> but this is deemed safe given the possible attach rates for
> kprobe/uprobe/tp programs.
>
> Separately, non-sleepable programs need access to dynamically sized
> rcu-protected maps, so we conditionally disable preemption and take an rcu
> read section around them, in addition to the overarching tasks_trace section.
>
> Signed-off-by: Delyan Kratunov <delyank@xxxxxx>
> ---
>  include/linux/bpf.h | 57 ++++++++++++++++++++++++++++++++++++
>  include/linux/trace_events.h | 1 +
>  kernel/bpf/core.c | 15 ++++++++++
>  kernel/trace/bpf_trace.c | 27 +++++++++++++++--
>  kernel/trace/trace_uprobe.c | 4 +--
>  5 files changed, 99 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 57ec619cf729..592886115011 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -26,6 +26,7 @@
>  #include <linux/stddef.h>
>  #include <linux/bpfptr.h>
>  #include <linux/btf.h>
> +#include <linux/rcupdate_trace.h>
>
>  struct bpf_verifier_env;
>  struct bpf_verifier_log;
> @@ -1343,6 +1344,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;
>
>  struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
>  void bpf_prog_array_free(struct bpf_prog_array *progs);
> +/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
> +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
>  int bpf_prog_array_length(struct bpf_prog_array *progs);
>  bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
>  int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
> @@ -1428,6 +1431,60 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
>          return ret;
>  }
>
> +/**
> + * Notes on RCU design for bpf_prog_arrays containing sleepable programs:
> + *
> + * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
> + * overall. As a result, we must use the bpf_prog_array_free_sleepable
> + * in order to use the tasks_trace rcu grace period.
> + *
> + * When a non-sleepable program is inside the array, we take the rcu read
> + * section and disable preemption for that program alone, so it can access
> + * rcu-protected dynamically sized maps.
> + */
> +static __always_inline u32
> +bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
> +                             const void *ctx, bpf_prog_run_fn run_prog)
> +{
> +        const struct bpf_prog_array_item *item;
> +        const struct bpf_prog *prog;
> +        const struct bpf_prog_array *array;
> +        struct bpf_run_ctx *old_run_ctx;
> +        struct bpf_trace_run_ctx run_ctx;
> +        u32 ret = 1;
> +
> +        might_fault();
> +
> +        migrate_disable();
> +        rcu_read_lock_trace();

pls swap this order to make it the same as the bpf trampoline.

> +
> +        array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
> +        if (unlikely(!array))
> +                goto out;
> +        old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> +        item = &array->items[0];
> +        while ((prog = READ_ONCE(item->prog))) {
> +                if (!prog->aux->sleepable) {
> +                        preempt_disable();

just remove this line.

> +                        rcu_read_lock();

In configs where rcu_read_lock/unlock is a nop, this whole 'if' and the
one below will be optimized out by the compiler. Which is nice.

> +                }
> +
> +                run_ctx.bpf_cookie = item->bpf_cookie;
> +                ret &= run_prog(prog, ctx);
> +                item++;
> +
> +                if (!prog->aux->sleepable) {
> +                        rcu_read_unlock();
> +                        preempt_enable();
> +                }
> +        }
> +        bpf_reset_run_ctx(old_run_ctx);
> +out:
> +        rcu_read_unlock_trace();
> +        migrate_enable();
> +        return ret;
> +}
> +
>  #ifdef CONFIG_BPF_SYSCALL
>  DECLARE_PER_CPU(int, bpf_prog_active);
>  extern struct mutex bpf_stats_enabled_mutex;
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index e6e95a9f07a5..d45889f1210d 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -736,6 +736,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
>
>  #ifdef CONFIG_BPF_EVENTS
>  unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
> +unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx);
>  int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
>  void perf_event_detach_bpf_prog(struct perf_event *event);
>  int perf_event_query_prog_array(struct perf_event *event, void __user *info);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 13e9dbeeedf3..9271b708807a 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2268,6 +2268,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
>          kfree_rcu(progs, rcu);
>  }
>
> +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
> +{
> +        struct bpf_prog_array *progs;
> +
> +        progs = container_of(rcu, struct bpf_prog_array, rcu);
> +        kfree_rcu(progs, rcu);
> +}
> +
> +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
> +{
> +        if (!progs || progs == &bpf_empty_prog_array.hdr)
> +                return;
> +        call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
> +}
> +
>  int bpf_prog_array_length(struct bpf_prog_array *array)
>  {
>          struct bpf_prog_array_item *item;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index f15b826f9899..582a6171e096 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -140,6 +140,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>          return ret;
>  }
>
> +unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx)
> +{
> +        unsigned int ret;
> +
> +        /*
> +         * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
> +         * to all call sites, we did a bpf_prog_array_valid() there to check
> +         * whether call->prog_array is empty or not, which is
> +         * a heuristic to speed up execution.
> +         *
> +         * If bpf_prog_array_valid() fetched prog_array was
> +         * non-NULL, we go into uprobe_call_bpf() and do the actual
> +         * proper rcu_dereference() under RCU trace lock.
> +         * If it turns out that prog_array is NULL then, we bail out.
> +         * For the opposite, if the bpf_prog_array_valid() fetched pointer
> +         * was NULL, you'll skip the prog_array with the risk of missing
> +         * out of events when it was updated in between this and the
> +         * rcu_dereference() which is accepted risk.
> +         */
> +        ret = bpf_prog_run_array_sleepable(call->prog_array, ctx, bpf_prog_run);
> +        return ret;
> +}
> +
>  #ifdef CONFIG_BPF_KPROBE_OVERRIDE
>  BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
>  {
> @@ -1915,7 +1938,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
>          event->prog = prog;
>          event->bpf_cookie = bpf_cookie;
>          rcu_assign_pointer(event->tp_event->prog_array, new_array);
> -        bpf_prog_array_free(old_array);
> +        bpf_prog_array_free_sleepable(old_array);
>
>  unlock:
>          mutex_unlock(&bpf_event_mutex);
> @@ -1941,7 +1964,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
>                  bpf_prog_array_delete_safe(old_array, event->prog);
>          } else {
>                  rcu_assign_pointer(event->tp_event->prog_array, new_array);
> -                bpf_prog_array_free(old_array);
> +                bpf_prog_array_free_sleepable(old_array);
>          }
>
>          bpf_prog_put(event->prog);
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index 9711589273cd..3eb48897d15b 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1346,9 +1346,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
>          if (bpf_prog_array_valid(call)) {
>                  u32 ret;
>
> -                preempt_disable();

It should have been replaced with migrate_disable long ago.

> -                ret = trace_call_bpf(call, regs);
> -                preempt_enable();
> +                ret = uprobe_call_bpf(call, regs);
>                  if (!ret)
>                          return;
>          }
> --
> 2.35.1
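
To spell out the ordering request above: take the tasks_trace read
section first and disable migration inside it, and unwind in the
reverse order on the way out, the same nesting the sleepable trampoline
enter/exit helpers use. Just a rough sketch of the enter/exit sequence
I'd expect in bpf_prog_run_array_sleepable(), not tested:

        might_fault();

        /* outer tasks_trace section first, migration disabled inside it */
        rcu_read_lock_trace();
        migrate_disable();

        /* ... rcu_dereference_check(array_rcu, ...) and run the programs ... */

        /* mirror the enter order on the way out */
        migrate_enable();
        rcu_read_unlock_trace();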