Patch "bpf: support deferring bpf_link dealloc to after RCU grace period" has been added to the 6.8-stable tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is a note to let you know that I've just added the patch titled

    bpf: support deferring bpf_link dealloc to after RCU grace period

to the 6.8-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     bpf-support-deferring-bpf_link-dealloc-to-after-rcu-.patch
and it can be found in the queue-6.8 subdirectory.

If you, or anyone else, feel it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit a37ac5c2b029abcd8bbd8fa268ad0d3004da8b23
Author: Andrii Nakryiko <andrii@xxxxxxxxxx>
Date:   Wed Mar 27 22:24:26 2024 -0700

    bpf: support deferring bpf_link dealloc to after RCU grace period
    
    [ Upstream commit 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce ]
    
    BPF link for some program types is passed as a "context" which can be
    used by those BPF programs to look up additional information. E.g., for
    multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.
    
    Because of this runtime dependency, when bpf_link refcnt drops to zero
    there could still be active BPF programs running accessing link data.
    
    This patch adds generic support to defer bpf_link dealloc callback to
    after RCU GP, if requested. This is done by exposing two different
    deallocation callbacks, one synchronous and one deferred. If deferred
    one is provided, bpf_link_free() will schedule dealloc_deferred()
    callback to happen after RCU GP.
    
    BPF is using two flavors of RCU: "classic" non-sleepable one and RCU
    tasks trace one. The latter is used when sleepable BPF programs are
    used. bpf_link_free() accommodates that by checking underlying BPF
    program's sleepable flag, and goes either through normal RCU GP only for
    non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP
    (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF
    program is sleepable.
    
    We use this for multi-kprobe and multi-uprobe links, which dereference
    link during program run. We also preventively switch raw_tp link to use
    deferred dealloc callback, as upcoming changes in bpf-next tree expose
    raw_tp link data (specifically, cookie value) to BPF program at runtime
    as well.
    
    Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
    Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
    Reported-by: syzbot+981935d9485a560bfbcb@xxxxxxxxxxxxxxxxxxxxxxxxx
    Reported-by: syzbot+2cb5a6c573e98db598cc@xxxxxxxxxxxxxxxxxxxxxxxxx
    Reported-by: syzbot+62d8b26793e8a2bd0516@xxxxxxxxxxxxxxxxxxxxxxxxx
    Signed-off-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
    Acked-by: Jiri Olsa <jolsa@xxxxxxxxxx>
    Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@xxxxxxxxxx
    Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b6b590451284..893a7ec57bf25 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1568,12 +1568,26 @@ struct bpf_link {
 	enum bpf_link_type type;
 	const struct bpf_link_ops *ops;
 	struct bpf_prog *prog;
-	struct work_struct work;
+	/* rcu is used before freeing, work can be used to schedule that
+	 * RCU-based freeing before that, so they never overlap
+	 */
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
 };
 
 struct bpf_link_ops {
 	void (*release)(struct bpf_link *link);
+	/* deallocate link resources callback, called without RCU grace period
+	 * waiting
+	 */
 	void (*dealloc)(struct bpf_link *link);
+	/* deallocate link resources callback, called after RCU grace period;
+	 * if underlying BPF program is sleepable we go through tasks trace
+	 * RCU GP and then "classic" RCU GP
+	 */
+	void (*dealloc_deferred)(struct bpf_link *link);
 	int (*detach)(struct bpf_link *link);
 	int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
 			   struct bpf_prog *old_prog);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 11a8ea854c1d5..83ec7f788a638 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2895,17 +2895,46 @@ void bpf_link_inc(struct bpf_link *link)
 	atomic64_inc(&link->refcnt);
 }
 
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
+{
+	struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+
+	/* free bpf_link and its containing memory */
+	link->ops->dealloc_deferred(link);
+}
+
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_link_defer_dealloc_rcu_gp(rcu);
+	else
+		call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
+
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
+	bool sleepable = false;
+
 	bpf_link_free_id(link->id);
 	if (link->prog) {
+		sleepable = link->prog->sleepable;
 		/* detach BPF program, clean up used resources */
 		link->ops->release(link);
 		bpf_prog_put(link->prog);
 	}
-	/* free bpf_link and its containing memory */
-	link->ops->dealloc(link);
+	if (link->ops->dealloc_deferred) {
+		/* schedule BPF link deallocation; if underlying BPF program
+		 * is sleepable, we need to first wait for RCU tasks trace
+		 * sync, then go through "classic" RCU grace period
+		 */
+		if (sleepable)
+			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+		else
+			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+	}
+	if (link->ops->dealloc)
+		link->ops->dealloc(link);
 }
 
 static void bpf_link_put_deferred(struct work_struct *work)
@@ -3415,7 +3444,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
 
 static const struct bpf_link_ops bpf_raw_tp_link_lops = {
 	.release = bpf_raw_tp_link_release,
-	.dealloc = bpf_raw_tp_link_dealloc,
+	.dealloc_deferred = bpf_raw_tp_link_dealloc,
 	.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
 	.fill_link_info = bpf_raw_tp_link_fill_link_info,
 };
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 22d555a42404f..c7f9236eed628 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2713,7 +2713,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 
 static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
 	.release = bpf_kprobe_multi_link_release,
-	.dealloc = bpf_kprobe_multi_link_dealloc,
+	.dealloc_deferred = bpf_kprobe_multi_link_dealloc,
 	.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
 };
 
@@ -3227,7 +3227,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
 
 static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
 	.release = bpf_uprobe_multi_link_release,
-	.dealloc = bpf_uprobe_multi_link_dealloc,
+	.dealloc_deferred = bpf_uprobe_multi_link_dealloc,
 	.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
 };
 




[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux