Hello, We are developing a tool to perform static analysis on the bpf subsystem to detect locking violations. Our tool reported the spin_lock_irqsave() in trie_delete_elem() and trie_update_elem() that could be called from an NMI. If a bpf program holding the lock is interrupted by the same program in NMI, a deadlock can happen. The report was generated for kernel version 6.6-rc4, however, we believe this should still exist in the latest kernel. We tried to validate the report on v6.7 and v5.15 kernels by running a PoC and found that trie->lock is not the only problematic lock. Lockdep also complained about memcg_stock.stock_lock used in lpm_trie_node_alloc() and krc.lock used in kfree_rcu(). Therefore, I wonder if we should just return error when in NMI for trie_delete_elem() and trie_update_elem() assuming there is no such use case. Below is one of the splats and the PoC is attached at the end. I am also copying Priya who is developing the tool. Thanks, Amery ================================ WARNING: inconsistent lock state 5.15.26+ #42 Not tainted -------------------------------- inconsistent {INITIAL USE} -> {IN-NMI} usage. test_prog_user/262 [HC1[1]:SC0[0]:HE0:SE1] takes: ffff9ec37dc20fb0 (krc.lock){..-.}-{2:2}, at: kvfree_call_rcu+0x6d/0x330 {INITIAL USE} state was registered at: lock_acquire+0xc8/0x2d0 _raw_spin_lock_irqsave+0x48/0x60 kfree_rcu_scheduler_running+0x4c/0xa6 rcu_set_runtime_mode+0x1e/0x2b do_one_initcall+0x5b/0x2d0 kernel_init_freeable+0x28e/0x2f5 kernel_init+0x16/0x110 ret_from_fork+0x22/0x30 irq event stamp: 6852 hardirqs last enabled at (6851): [<ffffffffa3600d82>] asm_sysvec_apic_timer_interrupt+0x12/0x20 hardirqs last disabled at (6852): [<ffffffffa35d1e4b>] exc_nmi+0xab/0x180 softirqs last enabled at (6850): [<ffffffffa2a9afc9>] __irq_exit_rcu+0xb9/0xe0 softirqs last disabled at (6845): [<ffffffffa2a9afc9>] __irq_exit_rcu+0xb9/0xe0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(krc.lock); <Interrupt> lock(krc.lock); *** DEADLOCK *** no locks held by test_prog_user/262. stack backtrace: CPU: 0 PID: 262 Comm: test_prog_user Not tainted 5.15.26+ #42 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: <TASK> dump_stack_lvl+0x57/0x72 lock_acquire.cold+0x43/0x48 ? kvfree_call_rcu+0x6d/0x330 _raw_spin_lock+0x2c/0x40 ? kvfree_call_rcu+0x6d/0x330 kvfree_call_rcu+0x6d/0x330 trie_delete_elem+0x198/0x200 bpf_prog_e10e7f0a82b8b6d9_trie_perf+0x70/0xefc bpf_overflow_handler+0xb3/0x1f0 __perf_event_overflow+0x52/0x100 handle_pmi_common+0x1f7/0x350 ? lock_acquire+0xc8/0x2d0 ? __lock_acquire+0x393/0x1d80 ? lock_is_held_type+0xa5/0x120 ? find_held_lock+0x2b/0x80 intel_pmu_handle_irq+0x119/0x2d0 ? nmi_handle+0x5/0x250 perf_event_nmi_handler+0x28/0x50 nmi_handle+0xce/0x250 default_do_nmi+0x40/0x120 exc_nmi+0x160/0x180 asm_exc_nmi+0x8e/0xd7 RIP: 0033:0x56257e1be3e8 Code: fd ff ff 48 89 85 68 ff ff ff 48 83 bd 68 ff ff ff 00 75 16 48 8d 05 7f 0c 00 00 48 89 c7 e8 1f fd ff ff b8 01 00 00 00 eb 02 <eb> fe 48 8b 55 f8 64 48 2b 14 25 28 00 00 00 74 05 e8 c2 fc ff ff RSP: 002b:00007ffd8b296f10 EFLAGS: 00000206 RAX: 000056257ff9a730 RBX: 00007ffd8b2970e8 RCX: 00007f6b04fe6b3b RDX: 0000000000000000 RSI: 0000000000002400 RDI: 0000000000000006 RBP: 00007ffd8b296fd0 R08: 0000000000000028 R09: 0000000600000005 R10: 0000000000000029 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd8b297100 R14: 000056257e1c0d80 R15: 00007f6b05159020 </TASK> The lockdep warning can be triggered using the following user and bpf programs. ================================ #include <bpf/bpf.h> #include <bpf/libbpf.h> #include <linux/perf_event.h> #include <sys/syscall.h> #include <unistd.h> long perf_event_open(struct perf_event_attr* event_attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, event_attr, pid, cpu, group_fd, flags); } int main(int argc, char **argv) { struct bpf_program* prog; struct bpf_object* obj; struct bpf_link* link; int ret, pfd; obj = bpf_object__open(argv[1]); if (!obj) { perror("bpf_object__open"); return 1; } ret = bpf_object__load(obj); if (ret) { perror("bpf_object__load"); return 1; } struct perf_event_attr attr_type_hw = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, .sample_freq = 50, .inherit = 1, .freq = 1, }; pfd = perf_event_open(&attr_type_hw, 0, -1, -1, 0); if (!pfd) { perror("perf_event_open"); return 1; } prog = bpf_object__next_program(obj, NULL); if (!prog) { perror("bpf_object__next_program"); return 1; } link = bpf_program__attach_perf_event(prog, pfd); if (!link) { perror("bpf_program__attach_perf_event"); return 1; } while (true) {}; return 0; } ============================== struct ipv4_lpm_key { __u32 prefixlen; __u32 data; }; struct { __uint(type, BPF_MAP_TYPE_LPM_TRIE); __type(key, struct ipv4_lpm_key); __type(value, __u32); __uint(map_flags, BPF_F_NO_PREALLOC); __uint(max_entries, 255); } pb SEC(".maps"); SEC("perf_event") int trie_perf(void *ctx) { struct ipv4_lpm_key key = { .prefixlen=2 , .data=2}; long init_val = 1; long *value; bpf_map_update_elem(&pb, &key, &init_val, BPF_ANY); value = bpf_map_lookup_elem(&pb, &key); int ret = bpf_map_delete_elem(&pb, &key); return 0; }