On Mon, May 02, 2022 at 11:09:37PM +0000, Delyan Kratunov wrote:
> uprobes work by raising a trap, setting a task flag from within the
> interrupt handler, and processing the actual work for the uprobe on the
> way back to userspace. As a result, uprobe handlers already execute in a
> user context. The primary obstacle to sleepable bpf uprobe programs is
> therefore on the bpf side.
>
> Namely, the bpf_prog_array attached to the uprobe is protected by normal
> rcu and runs with disabled preemption.

Disabled preemption is an artifact of the past. It became unnecessary
when migrate_disable() was introduced. Looks like we forgot to update
this spot.

> In order for uprobe bpf programs
> to become actually sleepable, we need it to be protected by the tasks_trace
> rcu flavor instead (and kfree() called after a corresponding grace period).
>
> Based on Alexei's proposal, we change the free path for bpf_prog_array to
> chain a tasks_trace and normal grace periods one after the other.
>
> Users who iterate under tasks_trace read section would
> be safe, as would users who iterate under normal read sections (from
> non-sleepable locations). The downside is that we take the tasks_trace latency
> for all perf_event-attached bpf programs (and not just uprobe ones)
> but this is deemed safe given the possible attach rates for
> kprobe/uprobe/tp programs.
>
> Separately, non-sleepable programs need access to dynamically sized
> rcu-protected maps, so we conditionally disable preemption and take an rcu
> read section around them, in addition to the overarching tasks_trace section.
>
> Signed-off-by: Delyan Kratunov <delyank@xxxxxx>
> ---
>  include/linux/bpf.h | 57 ++++++++++++++++++++++++++++++++++++
>  include/linux/trace_events.h | 1 +
>  kernel/bpf/core.c | 15 ++++++++++
>  kernel/trace/bpf_trace.c | 27 +++++++++++++++--
>  kernel/trace/trace_uprobe.c | 4 +--
>  5 files changed, 99 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 57ec619cf729..592886115011 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -26,6 +26,7 @@
>  #include <linux/stddef.h>
>  #include <linux/bpfptr.h>
>  #include <linux/btf.h>
> +#include <linux/rcupdate_trace.h>
>
>  struct bpf_verifier_env;
>  struct bpf_verifier_log;
> @@ -1343,6 +1344,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;
>
>  struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
>  void bpf_prog_array_free(struct bpf_prog_array *progs);
> +/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
> +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
>  int bpf_prog_array_length(struct bpf_prog_array *progs);
>  bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
>  int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
> @@ -1428,6 +1431,60 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
>          return ret;
>  }
>
> +/**
> + * Notes on RCU design for bpf_prog_arrays containing sleepable programs:
> + *
> + * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
> + * overall. As a result, we must use the bpf_prog_array_free_sleepable
> + * in order to use the tasks_trace rcu grace period.
> + *
> + * When a non-sleepable program is inside the array, we take the rcu read
> + * section and disable preemption for that program alone, so it can access
> + * rcu-protected dynamically sized maps.
> + */
> +static __always_inline u32
> +bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
> +                             const void *ctx, bpf_prog_run_fn run_prog)
> +{
> +        const struct bpf_prog_array_item *item;
> +        const struct bpf_prog *prog;
> +        const struct bpf_prog_array *array;
> +        struct bpf_run_ctx *old_run_ctx;
> +        struct bpf_trace_run_ctx run_ctx;
> +        u32 ret = 1;
> +
> +        might_fault();
> +
> +        migrate_disable();
> +        rcu_read_lock_trace();

pls swap this order to make it the same as the bpf trampoline.

> +
> +        array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
> +        if (unlikely(!array))
> +                goto out;
> +        old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> +        item = &array->items[0];
> +        while ((prog = READ_ONCE(item->prog))) {
> +                if (!prog->aux->sleepable) {
> +                        preempt_disable();

just remove this line.

> +                        rcu_read_lock();

In configs where rcu_read_lock/unlock is a nop, this whole 'if' and the
one below will be optimized out by the compiler. Which is nice.

> +                }
> +
> +                run_ctx.bpf_cookie = item->bpf_cookie;
> +                ret &= run_prog(prog, ctx);
> +                item++;
> +
> +                if (!prog->aux->sleepable) {
> +                        rcu_read_unlock();
> +                        preempt_enable();
> +                }
> +        }
> +        bpf_reset_run_ctx(old_run_ctx);
> +out:
> +        rcu_read_unlock_trace();
> +        migrate_enable();
> +        return ret;
> +}
> +
>  #ifdef CONFIG_BPF_SYSCALL
>  DECLARE_PER_CPU(int, bpf_prog_active);
>  extern struct mutex bpf_stats_enabled_mutex;
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index e6e95a9f07a5..d45889f1210d 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -736,6 +736,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
>
>  #ifdef CONFIG_BPF_EVENTS
>  unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
> +unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx);
>  int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
>  void perf_event_detach_bpf_prog(struct perf_event *event);
>  int perf_event_query_prog_array(struct perf_event *event, void __user *info);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 13e9dbeeedf3..9271b708807a 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2268,6 +2268,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
>          kfree_rcu(progs, rcu);
>  }
>
> +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
> +{
> +        struct bpf_prog_array *progs;
> +
> +        progs = container_of(rcu, struct bpf_prog_array, rcu);
> +        kfree_rcu(progs, rcu);
> +}
> +
> +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
> +{
> +        if (!progs || progs == &bpf_empty_prog_array.hdr)
> +                return;
> +        call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
> +}
> +
>  int bpf_prog_array_length(struct bpf_prog_array *array)
>  {
>          struct bpf_prog_array_item *item;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index f15b826f9899..582a6171e096 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -140,6 +140,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>          return ret;
>  }
>
> +unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx)
> +{
> +        unsigned int ret;
> +
> +        /*
> +         * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
> +         * to all call sites, we did a bpf_prog_array_valid() there to check
> +         * whether call->prog_array is empty or not, which is
> +         * a heuristic to speed up execution.
> +         *
> +         * If bpf_prog_array_valid() fetched prog_array was
> +         * non-NULL, we go into uprobe_call_bpf() and do the actual
> +         * proper rcu_dereference() under RCU trace lock.
> +         * If it turns out that prog_array is NULL then, we bail out.
> +         * For the opposite, if the bpf_prog_array_valid() fetched pointer
> +         * was NULL, you'll skip the prog_array with the risk of missing
> +         * out of events when it was updated in between this and the
> +         * rcu_dereference() which is accepted risk.
> +         */
> +        ret = bpf_prog_run_array_sleepable(call->prog_array, ctx, bpf_prog_run);
> +        return ret;
> +}
> +
>  #ifdef CONFIG_BPF_KPROBE_OVERRIDE
>  BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
>  {
> @@ -1915,7 +1938,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
>          event->prog = prog;
>          event->bpf_cookie = bpf_cookie;
>          rcu_assign_pointer(event->tp_event->prog_array, new_array);
> -        bpf_prog_array_free(old_array);
> +        bpf_prog_array_free_sleepable(old_array);
>
>  unlock:
>          mutex_unlock(&bpf_event_mutex);
> @@ -1941,7 +1964,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
>                  bpf_prog_array_delete_safe(old_array, event->prog);
>          } else {
>                  rcu_assign_pointer(event->tp_event->prog_array, new_array);
> -                bpf_prog_array_free(old_array);
> +                bpf_prog_array_free_sleepable(old_array);
>          }
>
>          bpf_prog_put(event->prog);
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index 9711589273cd..3eb48897d15b 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1346,9 +1346,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
>          if (bpf_prog_array_valid(call)) {
>                  u32 ret;
>
> -                preempt_disable();

It should have been replaced with migrate_disable long ago.

> -                ret = trace_call_bpf(call, regs);
> -                preempt_enable();
> +                ret = uprobe_call_bpf(call, regs);
>                  if (!ret)
>                          return;
>          }
> --
> 2.35.1
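
To spell out the ordering request above: take the tasks_trace read
section first and disable migration inside it, and unwind in the
reverse order on the way out, the same nesting the sleepable trampoline
enter/exit helpers use. Just a rough sketch of the enter/exit sequence
I'd expect in bpf_prog_run_array_sleepable(), not tested:

        might_fault();

        /* outer tasks_trace section first, migration disabled inside it */
        rcu_read_lock_trace();
        migrate_disable();

        /* ... rcu_dereference_check(array_rcu, ...) and run the programs ... */

        /* mirror the enter order on the way out */
        migrate_enable();
        rcu_read_unlock_trace();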