The grace period used internally within tracepoint.c:release_probes() uses call_rcu() to batch waiting for quiescence of old probe arrays, rather than using the tracepoint_synchronize_unregister() which blocks while waiting for quiescence. With the introduction of faultable syscall tracepoints, this causes use-after-free issues reproduced with syzkaller. Fix this by introducing tracepoint_call_rcu(), which uses the appropriate call_rcu() or call_rcu_tasks_trace() before invoking the rcu_free_old_probes callback. Use tracepoint_call_rcu() in bpf_link_free() for raw tracepoints as well, which has the same problem for syscall tracepoints. Reported-by: syzbot+b390c8062d8387b6272a@xxxxxxxxxxxxxxxxxxxxxxxxx Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults") Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> Cc: Michael Jeanson <mjeanson@xxxxxxxxxxxx> Cc: Steven Rostedt <rostedt@xxxxxxxxxxx> Cc: Masami Hiramatsu <mhiramat@xxxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Alexei Starovoitov <ast@xxxxxxxxxx> Cc: Yonghong Song <yhs@xxxxxx> Cc: Paul E. McKenney <paulmck@xxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx> Cc: Mark Rutland <mark.rutland@xxxxxxx> Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx> Cc: Namhyung Kim <namhyung@xxxxxxxxxx> Cc: Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> Cc: bpf@xxxxxxxxxxxxxxx Cc: Joel Fernandes <joel@xxxxxxxxxxxxxxxxx> Cc: Jordan Rife <jrife@xxxxxxxxxx> --- Changes since v0: - Introduce tracepoint_call_rcu(), - Fix bpf_link_free() use of call_rcu as well. --- include/linux/tracepoint.h | 9 +++++++++ kernel/bpf/syscall.c | 23 +++++++++++++++-------- kernel/tracepoint.c | 22 ++++++++++++++++++---- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 0dc67fad706c..45025d6b2dd6 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -104,6 +104,8 @@ void for_each_tracepoint_in_module(struct module *mod, * tracepoint_synchronize_unregister must be called between the last tracepoint * probe unregistration and the end of module exit to make sure there is no * caller executing a probe when it is freed. + * An alternative to tracepoint_synchronize_unregister() is to use + * tracepoint_call_rcu() for batched reclaim. */ #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) @@ -111,9 +113,16 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_rcu_tasks_trace(); synchronize_rcu(); } + +void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head, + void (*callback)(struct rcu_head *head)); + #else static inline void tracepoint_synchronize_unregister(void) { } +static inline void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head, + void (*callback)(struct rcu_head *head)) +{ } #endif #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 59de664e580d..1191dc1d4206 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3006,14 +3006,21 @@ static void bpf_link_free(struct bpf_link *link) bpf_prog_put(link->prog); } if (ops->dealloc_deferred) { - /* schedule BPF link deallocation; if underlying BPF program - * is sleepable, we need to first wait for RCU tasks trace - * sync, then go through "classic" RCU grace period - */ - if (sleepable) - call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); - else - call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + if (link->type == BPF_LINK_TYPE_RAW_TRACEPOINT) { + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + + tracepoint_call_rcu(raw_tp->btp->tp, &link->rcu, bpf_link_defer_dealloc_rcu_gp); + } else { + /* schedule BPF link deallocation; if underlying BPF program + * is sleepable, we need to first wait for RCU tasks trace + * sync, then go through "classic" RCU grace period + */ + if (sleepable) + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + else + call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + } } else if (ops->dealloc) ops->dealloc(link); } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6474e2cf22c9..ef60c5484eda 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -106,13 +106,27 @@ static void rcu_free_old_probes(struct rcu_head *head) kfree(container_of(head, struct tp_probes, rcu)); } -static inline void release_probes(struct tracepoint_func *old) +static bool tracepoint_is_syscall(struct tracepoint *tp) +{ + return !strcmp(tp->name, "sys_enter") || !strcmp(tp->name, "sys_exit"); +} + +void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head, + void (*callback)(struct rcu_head *head)) +{ + if (tracepoint_is_syscall(tp)) + call_rcu_tasks_trace(head, callback); + else + call_rcu(head, callback); +} + +static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu(&tp_probes->rcu, rcu_free_old_probes); + tracepoint_call_rcu(tp, &tp_probes->rcu, rcu_free_old_probes); } } @@ -334,7 +348,7 @@ static int tracepoint_add_func(struct tracepoint *tp, break; } - release_probes(old); + release_probes(tp, old); return 0; } @@ -406,7 +420,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, WARN_ON_ONCE(1); break; } - release_probes(old); + release_probes(tp, old); return 0; } -- 2.39.5