Hi Andrii Nakryiko,

Greetings!

On 2024-03-19 at 16:38:49 -0700, Andrii Nakryiko wrote:
> Instead of passing prog as an argument to bpf_trace_runX() helpers, that
> are called from tracepoint triggering calls, store BPF link itself
> (struct bpf_raw_tp_link for raw tracepoints). This will allow to pass
> extra information like BPF cookie into raw tracepoint registration.
>
> Instead of replacing `struct bpf_prog *prog = __data;` with
> corresponding `struct bpf_raw_tp_link *link = __data;` assignment in
> `__bpf_trace_##call` I just passed `__data` through into underlying
> bpf_trace_runX() call. This works well because we implicitly cast `void *`,
> and it also avoids naming clashes with arguments coming from
> tracepoint's "proto" list. We could have run into the same problem with
> "prog", we just happened to not have a tracepoint that has "prog" input
> argument. We are less lucky with "link", as there are tracepoints using
> "link" argument name already. So instead of trying to avoid naming
> conflicts, let's just remove intermediate local variable. It doesn't
> hurt readibility, it's either way a bit of a maze of calls and macros,
> that requires careful reading.
>
> Acked-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> Signed-off-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> ---
>  include/linux/bpf.h          |  5 +++++
>  include/linux/trace_events.h | 36 ++++++++++++++++++++----------------
>  include/trace/bpf_probe.h    |  3 +--
>  kernel/bpf/syscall.c         |  9 ++-------
>  kernel/trace/bpf_trace.c     | 18 ++++++++++--------
>  5 files changed, 38 insertions(+), 33 deletions(-)
>

I used syzkaller to test an Intel internal kernel and hit a
"KASAN: slab-use-after-free Read in bpf_trace_run4" problem.

Bisection pointed to this commit:
d4dfc5700e86 bpf: pass whole link instead of prog when triggering raw tracepoint

I checked that the commit above is the same as this patch.

All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240409_092216_bpf_trace_run4
Syzkaller repro code: https://github.com/xupengfe/syzkaller_logs/blob/main/240409_092216_bpf_trace_run4/repro.c
Syzkaller syscall repro steps: https://github.com/xupengfe/syzkaller_logs/blob/main/240409_092216_bpf_trace_run4/repro.prog
Kconfig (make olddefconfig): https://github.com/xupengfe/syzkaller_logs/blob/main/240409_092216_bpf_trace_run4/kconfig_origin
Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/240409_092216_bpf_trace_run4/bisect_info.log
Issue bzImage: https://github.com/xupengfe/syzkaller_logs/raw/main/240409_092216_bpf_trace_run4/bzImage_v6.9-rc2_next.tar.gz
Issue dmesg: https://github.com/xupengfe/syzkaller_logs/blob/main/240409_092216_bpf_trace_run4/5d8569db0cb982d3c630482c285578e98a75fc90_dmesg.log

"
[ 24.977435] ==================================================================
[ 24.978307] BUG: KASAN: slab-use-after-free in bpf_trace_run4+0x547/0x5e0
[ 24.979138] Read of size 8 at addr ffff888015676218 by task rcu_preempt/16
[ 24.979936]
[ 24.980152] CPU: 0 PID: 16 Comm: rcu_preempt Not tainted 6.9.0-rc2-5d8569db0cb9+ #1
[ 24.981040] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 24.982352] Call Trace:
[ 24.982672]  <TASK>
[ 24.982952]  dump_stack_lvl+0xe9/0x150
[ 24.983449]  print_report+0xd0/0x610
[ 24.983904]  ? bpf_trace_run4+0x547/0x5e0
[ 24.984393]  ? kasan_complete_mode_report_info+0x80/0x200
[ 24.985039]  ? bpf_trace_run4+0x547/0x5e0
[ 24.985528]  kasan_report+0x9f/0xe0
[ 24.985961]  ? bpf_trace_run4+0x547/0x5e0
[ 24.986457]  __asan_report_load8_noabort+0x18/0x20
[ 24.987055]  bpf_trace_run4+0x547/0x5e0
[ 24.987532]  ? __pfx_bpf_trace_run4+0x10/0x10
[ 24.988061]  ? __this_cpu_preempt_check+0x20/0x30
[ 24.988670]  ? lock_is_held_type+0xe5/0x140
[ 24.989185]  ? set_next_entity+0x38c/0x630
[ 24.989698]  ? put_prev_entity+0x50/0x1f0
[ 24.990199]  __bpf_trace_sched_switch+0x14/0x20
[ 24.990776]  __traceiter_sched_switch+0x7a/0xd0
[ 24.991293]  __schedule+0xc6d/0x2840
[ 24.991721]  ? __pfx___schedule+0x10/0x10
[ 24.992170]  ? lock_release+0x3f6/0x790
[ 24.992616]  ? __this_cpu_preempt_check+0x20/0x30
[ 24.993140]  ? schedule+0x1f3/0x290
[ 24.993536]  ? __pfx_lock_release+0x10/0x10
[ 24.994003]  ? _raw_spin_unlock_irqrestore+0x39/0x70
[ 24.994561]  ? schedule_timeout+0x559/0x940
[ 24.995021]  ? __this_cpu_preempt_check+0x20/0x30
[ 24.995548]  schedule+0xcf/0x290
[ 24.995922]  schedule_timeout+0x55e/0x940
[ 24.996369]  ? __pfx_schedule_timeout+0x10/0x10
[ 24.996870]  ? prepare_to_swait_event+0xff/0x450
[ 24.997401]  ? prepare_to_swait_event+0xc4/0x450
[ 24.997916]  ? __this_cpu_preempt_check+0x20/0x30
[ 24.998445]  ? __pfx_process_timeout+0x10/0x10
[ 24.998971]  ? tcp_get_idx+0xd0/0x270
[ 24.999408]  ? prepare_to_swait_event+0xff/0x450
[ 24.999934]  rcu_gp_fqs_loop+0x661/0xa70
[ 25.000399]  ? __pfx_rcu_gp_fqs_loop+0x10/0x10
[ 25.000913]  ? __pfx_rcu_gp_init+0x10/0x10
[ 25.001381]  rcu_gp_kthread+0x25e/0x360
[ 25.001822]  ? __pfx_rcu_gp_kthread+0x10/0x10
[ 25.002324]  ? __sanitizer_cov_trace_const_cmp1+0x1e/0x30
[ 25.002966]  ? __kthread_parkme+0x146/0x220
[ 25.003472]  ? __pfx_rcu_gp_kthread+0x10/0x10
[ 25.003995]  kthread+0x354/0x470
[ 25.004400]  ? __pfx_kthread+0x10/0x10
[ 25.004862]  ret_from_fork+0x57/0x90
[ 25.005315]  ? __pfx_kthread+0x10/0x10
[ 25.005778]  ret_from_fork_asm+0x1a/0x30
[ 25.006282]  </TASK>
[ 25.006560]
[ 25.006773] Allocated by task 732:
[ 25.007187]  kasan_save_stack+0x2a/0x50
[ 25.007660]  kasan_save_track+0x18/0x40
[ 25.008124]  kasan_save_alloc_info+0x3b/0x50
[ 25.008649]  __kasan_kmalloc+0x86/0xa0
[ 25.009107]  kmalloc_trace+0x1c5/0x3d0
[ 25.009599]  bpf_raw_tp_link_attach+0x28e/0x5a0
[ 25.010163]  __sys_bpf+0x452/0x5550
[ 25.010599]  __x64_sys_bpf+0x7e/0xc0
[ 25.011054]  do_syscall_64+0x73/0x150
[ 25.011523]  entry_SYSCALL_64_after_hwframe+0x71/0x79
[ 25.012156]
[ 25.012360] Freed by task 732:
[ 25.012740]  kasan_save_stack+0x2a/0x50
[ 25.013211]  kasan_save_track+0x18/0x40
[ 25.013689]  kasan_save_free_info+0x3e/0x60
[ 25.014198]  __kasan_slab_free+0x107/0x190
[ 25.014694]  kfree+0xf3/0x320
[ 25.015085]  bpf_raw_tp_link_dealloc+0x1e/0x30
[ 25.015632]  bpf_link_free+0x145/0x1b0
[ 25.016094]  bpf_link_put_direct+0x45/0x60
[ 25.016593]  bpf_link_release+0x40/0x50
[ 25.017064]  __fput+0x273/0xb70
[ 25.017489]  ____fput+0x1e/0x30
[ 25.017890]  task_work_run+0x1a3/0x2d0
[ 25.018356]  do_exit+0xad3/0x31b0
[ 25.018800]  do_group_exit+0xdf/0x2b0
[ 25.019256]  __x64_sys_exit_group+0x47/0x50
[ 25.019763]  do_syscall_64+0x73/0x150
[ 25.020215]  entry_SYSCALL_64_after_hwframe+0x71/0x79
[ 25.020830]
[ 25.021036] The buggy address belongs to the object at ffff888015676200
[ 25.021036]  which belongs to the cache kmalloc-128 of size 128
[ 25.022465] The buggy address is located 24 bytes inside of
[ 25.022465]  freed 128-byte region [ffff888015676200, ffff888015676280)
[ 25.023780]
[ 25.023970] The buggy address belongs to the physical page:
[ 25.024563] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x15676
[ 25.025401] flags: 0xfffffe0000800(slab|node=0|zone=1|lastcpupid=0x3fffff)
[ 25.026140] page_type: 0xffffffff()
[ 25.026535] raw: 000fffffe0000800 ffff88800a0418c0 dead000000000122 0000000000000000
[ 25.027363] raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
[ 25.028182] page dumped because: kasan: bad access detected
[ 25.028773]
[ 25.028957] Memory state around the buggy address:
[ 25.029476]  ffff888015676100: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 25.030244]  ffff888015676180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 25.031018] >ffff888015676200: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 25.031780]                    ^
[ 25.032221]  ffff888015676280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 25.032992]  ffff888015676300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 25.033769] ==================================================================
[ 25.034571] Disabling lock debugging due to kernel taint
"

Could you please take a look and check whether it is useful? Thanks!

(A minimal standalone sketch of the userspace attach/close sequence that ends
up in the allocation and free paths above is included after the reproduction
notes below.)

---

If you don't need the following environment to reproduce the problem, or if
you already have a reproduction environment, please ignore the information
below.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh  // it needs qemu-system-x86_64 and I used v7.1.0
  // start3.sh will load the bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
  // You could change the bzImage_xxx as you want
  // You may need to remove the line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for a different qemu version

You can use the command below to log in; there is no password for root:
ssh -p 10023 root@localhost

After logging into the VM (virtual machine) successfully, you can transfer the
reproducer binary to the VM as below and reproduce the problem there:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

To get the bzImage for the target kernel:
Please use the target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage  // x should be equal to or less than the number of CPUs your PC has

Fill the resulting bzImage into the above start3.sh to load the target kernel
in the VM.

Tips:
If you already have qemu-system-x86_64, please ignore the info below.
If you want to install qemu v7.1.0:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install

Best Regards,
Thanks!
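For convenience, here is the minimal standalone sketch mentioned above (my own
illustration, NOT the linked repro.c; it only uses the documented bpf(2)
commands and needs root): it walks the userspace sequence that ends up in
bpf_raw_tp_link_attach() from the "Allocated by task 732" stack and, when the
link fd is dropped, in bpf_raw_tp_link_release()/bpf_raw_tp_link_dealloc()
from the "Freed by task 732" stack. The file name and the choice of the
sched_switch tracepoint are mine, picked to match the call trace above.

/* build: gcc -o rawtp_sketch rawtp_sketch.c (file name is arbitrary) */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	/* trivial BPF_PROG_TYPE_RAW_TRACEPOINT program: "r0 = 0; exit" */
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	static const char license[] = "GPL";
	union bpf_attr attr;
	int prog_fd, link_fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
	attr.insns = (unsigned long)insns;
	attr.insn_cnt = sizeof(insns) / sizeof(insns[0]);
	attr.license = (unsigned long)license;
	prog_fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
	if (prog_fd < 0) {
		perror("BPF_PROG_LOAD");
		return 1;
	}

	/* attach to the sched_switch raw tracepoint; in the kernel this goes
	 * through bpf_raw_tp_link_attach() -> bpf_probe_register() */
	memset(&attr, 0, sizeof(attr));
	attr.raw_tracepoint.name = (unsigned long)"sched_switch";
	attr.raw_tracepoint.prog_fd = prog_fd;
	link_fd = sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
	if (link_fd < 0) {
		perror("BPF_RAW_TRACEPOINT_OPEN");
		return 1;
	}

	/* dropping the last reference to the link fd (explicitly here, or
	 * implicitly on process exit, as in the "Freed by" stack) ends up in
	 * bpf_raw_tp_link_release() -> bpf_probe_unregister() and then
	 * bpf_raw_tp_link_dealloc() */
	close(link_fd);
	close(prog_fd);
	return 0;
}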
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 17843e66a1d3..2ea8ce59f582 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1607,6 +1607,11 @@ struct bpf_tracing_link {
>  	struct bpf_prog *tgt_prog;
>  };
>
> +struct bpf_raw_tp_link {
> +	struct bpf_link link;
> +	struct bpf_raw_event_map *btp;
> +};
> +
>  struct bpf_link_primer {
>  	struct bpf_link *link;
>  	struct file *file;
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index d68ff9b1247f..a7fc6fb6de3c 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -759,8 +759,11 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
>  int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
>  void perf_event_detach_bpf_prog(struct perf_event *event);
>  int perf_event_query_prog_array(struct perf_event *event, void __user *info);
> -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> +
> +struct bpf_raw_tp_link;
> +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link);
> +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link);
> +
>  struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
>  void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
>  int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> @@ -788,11 +791,12 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info)
>  {
>  	return -EOPNOTSUPP;
>  }
> -static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p)
> +struct bpf_raw_tp_link;
> +static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
>  {
>  	return -EOPNOTSUPP;
>  }
> -static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p)
> +static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
>  {
>  	return -EOPNOTSUPP;
>  }
> @@ -903,31 +907,31 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
>  int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
>  void perf_event_free_bpf_prog(struct perf_event *event);
>
> -void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
> -void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
> -void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run1(struct bpf_raw_tp_link *link, u64 arg1);
> +void bpf_trace_run2(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2);
> +void bpf_trace_run3(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		    u64 arg3);
> -void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run4(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		    u64 arg3, u64 arg4);
> -void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run5(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		    u64 arg3, u64 arg4, u64 arg5);
> -void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run6(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		    u64 arg3, u64 arg4, u64 arg5, u64 arg6);
> -void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run7(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7);
> -void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run8(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
>  		    u64 arg8);
> -void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run9(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
>  		    u64 arg8, u64 arg9);
> -void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run10(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
>  		     u64 arg8, u64 arg9, u64 arg10);
> -void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run11(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
>  		     u64 arg8, u64 arg9, u64 arg10, u64 arg11);
> -void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
> +void bpf_trace_run12(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
>  		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
>  		     u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12);
>  void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
> diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
> index e609cd7da47e..a2ea11cc912e 100644
> --- a/include/trace/bpf_probe.h
> +++ b/include/trace/bpf_probe.h
> @@ -46,8 +46,7 @@
>  static notrace void							\
>  __bpf_trace_##call(void *__data, proto)					\
>  {									\
> -	struct bpf_prog *prog = __data;					\
> -	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \
> +	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \
>  }
>
>  #undef DECLARE_EVENT_CLASS
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index ae2ff73bde7e..1cb4c3809af4 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3469,17 +3469,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
>  	return err;
>  }
>
> -struct bpf_raw_tp_link {
> -	struct bpf_link link;
> -	struct bpf_raw_event_map *btp;
> -};
> -
>  static void bpf_raw_tp_link_release(struct bpf_link *link)
>  {
>  	struct bpf_raw_tp_link *raw_tp =
>  		container_of(link, struct bpf_raw_tp_link, link);
>
> -	bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
> +	bpf_probe_unregister(raw_tp->btp, raw_tp);
>  	bpf_put_raw_tracepoint(raw_tp->btp);
>  }
>
> @@ -3833,7 +3828,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
>  		goto out_put_btp;
>  	}
>
> -	err = bpf_probe_register(link->btp, prog);
> +	err = bpf_probe_register(link->btp, link);
>  	if (err) {
>  		bpf_link_cleanup(&link_primer);
>  		goto out_put_btp;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 30ecf62f8a17..17de91ad4a1f 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -2366,8 +2366,10 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
>  }
>
>  static __always_inline
> -void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
> +void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
>  {
> +	struct bpf_prog *prog = link->link.prog;
> +
>  	cant_sleep();
>  	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
>  		bpf_prog_inc_misses_counter(prog);
> @@ -2404,12 +2406,12 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
>  #define __SEQ_0_11	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
>
>  #define BPF_TRACE_DEFN_x(x)						\
> -	void bpf_trace_run##x(struct bpf_prog *prog,			\
> +	void bpf_trace_run##x(struct bpf_raw_tp_link *link,		\
>  			      REPEAT(x, SARG, __DL_COM, __SEQ_0_11))	\
>  	{								\
>  		u64 args[x];						\
>  		REPEAT(x, COPY, __DL_SEM, __SEQ_0_11);			\
> -		__bpf_trace_run(prog, args);				\
> +		__bpf_trace_run(link, args);				\
>  	}								\
>  	EXPORT_SYMBOL_GPL(bpf_trace_run##x)
>  BPF_TRACE_DEFN_x(1);
> @@ -2425,9 +2427,10 @@ BPF_TRACE_DEFN_x(10);
>  BPF_TRACE_DEFN_x(11);
>  BPF_TRACE_DEFN_x(12);
>
> -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
> +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
>  {
>  	struct tracepoint *tp = btp->tp;
> +	struct bpf_prog *prog = link->link.prog;
>
>  	/*
>  	 * check that program doesn't access arguments beyond what's
> @@ -2439,13 +2442,12 @@ int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
>  	if (prog->aux->max_tp_access > btp->writable_size)
>  		return -EINVAL;
>
> -	return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
> -						   prog);
> +	return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link);
>  }
>
> -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
> +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
>  {
> -	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
> +	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link);
>  }
>
>  int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> --
> 2.43.0
>
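One more illustration, in case it helps to see the object-lifetime pattern in
isolation: after this patch the tracepoint probe dereferences the registered
struct bpf_raw_tp_link (link->link.prog in __bpf_trace_run()), while
bpf_raw_tp_link_release()/bpf_raw_tp_link_dealloc() free that link when the
fd goes away. Below is a deliberately contrived userspace analogy of that
pattern (entirely my own sketch, not derived from the kernel code or the
repro): a "probe" thread keeps the pointer it was registered with and
dereferences it after the owner has already freed the object, with no grace
period in between. Built with "gcc -pthread -fsanitize=address", it reports a
heap-use-after-free read of the same general shape as the KASAN splat above.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_prog { long active; };
/* stands in for struct bpf_raw_tp_link (the link plus the prog it points to) */
struct fake_link { struct fake_prog *prog; };

static atomic_int released;

/* stands in for the tracepoint probe: it was registered with a pointer to
 * the link and keeps using that pointer even after the owner tore it down */
static void *probe(void *data)
{
	struct fake_link *link = data;

	while (!atomic_load(&released))
		;	/* probe is still "live" while the link gets freed */

	/* like __bpf_trace_run() reading link->link.prog: use-after-free read */
	printf("prog->active = %ld\n", link->prog->active);
	return NULL;
}

int main(void)
{
	struct fake_link *link = calloc(1, sizeof(*link));
	pthread_t t;

	link->prog = calloc(1, sizeof(*link->prog));

	/* "bpf_probe_register(btp, link)": hand the link pointer to the probe */
	pthread_create(&t, NULL, probe, link);

	/* "bpf_raw_tp_link_release() + bpf_raw_tp_link_dealloc()": free the
	 * link right away, without waiting for in-flight probes to finish */
	free(link->prog);
	free(link);
	atomic_store(&released, 1);

	pthread_join(t, NULL);
	return 0;
}

The point of the analogy is only the ordering: the object is freed before the
last user of it is guaranteed to be done.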