Hello,

We are developing a tool that performs static analysis on the BPF subsystem to detect locking violations. Our tool reported the raw_spin_lock_irqsave() call in bpf_common_lru_push_free(). This function is used by htab_lru_map_update_elem() and htab_lru_map_delete_elem(), which can be called from NMI context. A deadlock can happen if a BPF program holding the lock is interrupted by the same program running in NMI context.

The report was generated for kernel version 6.6-rc4; however, we believe the issue still exists in the latest kernel. We tried to validate the report on v6.7 by running a PoC. Below is the lockdep splat. The PoC is attached at the end.

I am also copying Hsin-Wei, who is involved in developing the tool.

Thanks,
Priya

[ 698.417248] ================================
[ 698.417255] WARNING: inconsistent lock state
[ 698.417258] 6.7.0-dirty #8 Not tainted
[ 698.417265] --------------------------------
[ 698.417268] inconsistent {INITIAL USE} -> {IN-NMI} usage.
[ 698.417273] lru_perf/1064 [HC1[1]:SC0[0]:HE0:SE1] takes:
[ 698.417290] ffffe8fffc227ac0 (&loc_l->lock){....}-{2:2}, at: bpf_lru_pop_free+0x2fb/0x13a0
[ 698.417379] {INITIAL USE} state was registered at:
[ 698.417384] lock_acquire+0x193/0x4c0
[ 698.417425] _raw_spin_lock_irqsave+0x3f/0x90
[ 698.417475] bpf_lru_pop_free+0x2fb/0x13a0
[ 698.417487] htab_lru_map_update_elem+0x16e/0xcb0
[ 698.417507] bpf_prog_47d4157ca618f90f_lru_tp+0x61/0xa1
[ 698.417522] trace_call_bpf+0x273/0x920
[ 698.417553] perf_trace_run_bpf_submit+0x8f/0x1c0
[ 698.417582] perf_trace_sched_switch+0x5c9/0x9c0
[ 698.417608] __traceiter_sched_switch+0x6f/0xc0
[ 698.417626] __schedule+0xae0/0x2ae0
[ 698.417647] __cond_resched+0x46/0x70
[ 698.417661] down_read+0x7f/0x350
[ 698.417677] kernfs_iop_permission+0xc2/0x130
[ 698.417715] inode_permission+0x38f/0x5f0
[ 698.417752] link_path_walk.part.0.constprop.0+0x821/0xcf0
[ 698.417773] path_lookupat+0x92/0x770
[ 698.417783] path_openat+0x1cc3/0x2690
[ 698.417794] do_filp_open+0x1c9/0x420
[ 698.417806] do_sys_openat2+0x164/0x1d0
[ 698.417830] __x64_sys_openat+0x140/0x1f0
[ 698.417850] do_syscall_64+0x46/0xf0
[ 698.417876] entry_SYSCALL_64_after_hwframe+0x6f/0x77
[ 698.417929] irq event stamp: 33026
[ 698.417933] hardirqs last enabled at (33025): [<ffffffff8480144a>] asm_sysvec_apic_timer_interrupt+0x1a/0x20
[ 698.417951] hardirqs last disabled at (33026): [<ffffffff8478ca89>] exc_nmi+0x159/0x200
[ 698.417966] softirqs last enabled at (33022): [<ffffffff847b5541>] __do_softirq+0x4e1/0x73e
[ 698.417990] softirqs last disabled at (33015): [<ffffffff811ab473>] irq_exit_rcu+0x93/0xc0
[ 698.418013]
[ 698.418013] other info that might help us debug this:
[ 698.418020] Possible unsafe locking scenario:
[ 698.418020]
[ 698.418023] CPU0
[ 698.418025] ----
[ 698.418026] lock(&loc_l->lock);
[ 698.418034] <Interrupt>
[ 698.418035] lock(&loc_l->lock);
[ 698.418042]
[ 698.418042] *** DEADLOCK ***
[ 698.418042]
[ 698.418044] no locks held by lru_perf/1064.
[ 698.418049]
[ 698.418049] stack backtrace:
[ 698.418057] CPU: 1 PID: 1064 Comm: lru_perf Not tainted 6.7.0-dirty #8
[ 698.418070] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
[ 698.418078] Call Trace:
[ 698.418091] <TASK>
[ 698.418097] dump_stack_lvl+0x91/0xf0
[ 698.418129] lock_acquire+0x35b/0x4c0
[ 698.418150] ? __pfx_lock_acquire+0x10/0x10
[ 698.418169] ? bpf_lru_pop_free+0x2fb/0x13a0
[ 698.418185] ? trace_event_raw_event_bpf_trace_printk+0x14a/0x210
[ 698.418201] ? __pfx_trace_event_raw_event_bpf_trace_printk+0x10/0x10
[ 698.418216] ? bstr_printf+0x348/0xf40
[ 698.418243] _raw_spin_lock_irqsave+0x3f/0x90
[ 698.418258] ? bpf_lru_pop_free+0x2fb/0x13a0
[ 698.418273] bpf_lru_pop_free+0x2fb/0x13a0
[ 698.418292] ? __pfx_bpf_trace_printk+0x10/0x10
[ 698.418308] htab_lru_map_update_elem+0x16e/0xcb0
[ 698.418324] ? perf_prepare_sample+0x16b/0x2060
[ 698.418339] ? perf_event_update_userpage+0x4db/0x800
[ 698.418370] bpf_prog_73160903ac17fb89_lru_perf+0x61/0xa1
[ 698.418387] bpf_overflow_handler+0x184/0x4a0
[ 698.418404] ? __pfx_bpf_overflow_handler+0x10/0x10
[ 698.418425] __perf_event_overflow+0x4c2/0x9e0
[ 698.418445] handle_pmi_common+0x4d7/0x800
[ 698.418481] ? hlock_class+0x4e/0x140
[ 698.418496] ? __lock_acquire+0x150a/0x3b10
[ 698.418518] ? __pfx_handle_pmi_common+0x10/0x10
[ 698.418540] ? __hrtimer_run_queues+0x1ef/0xa00
[ 698.418576] ? hlock_class+0x4e/0x140
[ 698.418591] ? lock_release+0x587/0xaa0
[ 698.418611] ? __pfx_lock_release+0x10/0x10
[ 698.418632] ? hlock_class+0x4e/0x140
[ 698.418646] ? look_up_lock_class+0x56/0x140
[ 698.418664] ? lock_acquire+0x272/0x4c0
[ 698.418682] ? intel_bts_interrupt+0x115/0x3e0
[ 698.418707] intel_pmu_handle_irq+0x246/0xd90
[ 698.418729] perf_event_nmi_handler+0x4c/0x70
[ 698.418750] nmi_handle+0x1a6/0x520
[ 698.418785] default_do_nmi+0x64/0x1c0
[ 698.418801] exc_nmi+0x187/0x200
[ 698.418815] asm_exc_nmi+0xb6/0xff
[ 698.418829] RIP: 0033:0x55f642e8460b
[ 698.418840] Code: ff ff ff ff 48 8b 05 54 9a 04 00 48 89 c1 ba 2d 00 00 00 be 01 00 00 00 48 8d 05 e8 1b 03 00 48 89 c7 e8 d8 f6 ff ff eb 10 90 <0f> b6 05 37 9a 04 00 83 f0 01 84 c0 75 f2 90 48 83 bd 08 fe ff ff
[ 698.418853] RSP: 002b:00007ffc89e99de0 EFLAGS: 00000202
[ 698.418879] RAX: 0000000000000001 RBX: 000055f6439a7038 RCX: 0000000000000000
[ 698.418888] RDX: 000000055f6439a6 RSI: 000055f6439a5010 RDI: 0000000000000007
[ 698.418897] RBP: 00007ffc89e9a010 R08: 000055f6439a6f60 R09: 000055f6439a52e0
[ 698.418906] R10: 0000000000000000 R11: b89c2540e1908856 R12: 00007ffc89e9a128
[ 698.418914] R13: 000055f642e84084 R14: 000055f642ecb9d8 R15: 00007f68b8e01040
[ 698.418933] </TASK>

The lockdep warning can be triggered using the following user and BPF programs.
================================ #include <unistd.h> #include <sys/syscall.h> #include <linux/perf_event.h> #include <bpf/libbpf.h> #include <bpf/bpf.h> #include <sys/resource.h> #include <signal.h> #include "lru_perf.skel.h" static volatile bool exiting = false; static void sig_handler(int sig) { exiting = true; return; } extern int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { int ret; ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); return ret; } void bump_memlock_rlimit(void) { struct rlimit rlim_new = { .rlim_cur = RLIM_INFINITY, .rlim_max = RLIM_INFINITY, }; if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) { fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n"); exit(1); } return; } int main(int argc, char *const argv[]) { const char *online_cpus_file = "/sys/devices/system/cpu/online"; int cpu; struct lru_perf_bpf *skel = NULL; struct perf_event_attr attr; struct bpf_link **links = NULL; int num_cpus, num_online_cpus; int *pefds = NULL, pefd; int i, err = 0; bool *online_mask = NULL; struct bpf_program *prog; struct bpf_object *obj; struct bpf_map *map; char filename[256]; bump_memlock_rlimit(); signal(SIGINT, sig_handler); signal(SIGTERM, sig_handler); err = parse_cpu_mask_file(online_cpus_file, &online_mask, &num_online_cpus); if (err) { fprintf(stderr, "Fail to get online CPU numbers: %d\n", err); goto cleanup; } num_cpus = libbpf_num_possible_cpus(); if (num_cpus <= 0) { fprintf(stderr, "Fail to get the number of processors\n"); err = -1; goto cleanup; } snprintf(filename, sizeof(filename), ".output/lru_perf.bpf.o"); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); goto cleanup; } map = bpf_object__find_map_by_name(obj, "pb"); if (libbpf_get_error(map)) { fprintf(stderr, "ERROR: finding a map in obj file 
failed\n"); goto cleanup; } if (bpf_object__load(obj)) { fprintf(stderr, "ERROR: loading BPF object file failed\n"); goto cleanup; } pefds = malloc(num_cpus * sizeof(int)); for (i = 0; i < num_cpus; i++) { pefds[i] = -1; } links = calloc(num_cpus, sizeof(struct bpf_link *)); memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_HARDWARE; attr.config = PERF_COUNT_HW_CPU_CYCLES; attr.sample_freq = 10; attr.inherit = 1; attr.freq = 1; for (cpu = 0; cpu < 2; cpu++) { //skip offline/not present CPUs if (cpu >= num_online_cpus || !online_mask[cpu]) continue; // Set up performance monitoring on a CPU/Core pefd = perf_event_open(&attr, 0, -1, -1, 0); if (pefd < 0) { fprintf(stderr, "Fail to set up performance monitor on a CPU/Core\n"); err = -1; goto cleanup; } pefds[cpu] = pefd; prog = bpf_object__find_program_by_name(obj, "lru_perf"); if (!prog) { fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); goto cleanup; } links[cpu] = bpf_program__attach_perf_event(prog, pefds[cpu]); if (!links[cpu]) { err = -1; fprintf(stderr, "ERROR: bpf_program__attach failed\n"); goto cleanup; } } while(!exiting){ } cleanup: if (links) { for (cpu = 0; cpu < num_cpus; cpu++) bpf_link__destroy(links[cpu]); free(links); } if (pefds) { for (i = 0; i < num_cpus; i++) { if (pefds[i] >= 0) close(pefds[i]); } free(pefds); } lru_perf_bpf__destroy(skel); free(online_mask); return -err; } ============================== #include "vmlinux.h" #include <linux/version.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <bpf/bpf_core_read.h> #define MAX_ENTRIES 1000 #define MAX_NR_CPUS 1024 #define TASK_COMM_LEN 16 #define MAX_FILENAME_LEN 512 struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __type(key, int); __type(value,int); __uint(max_entries, 255); } pb SEC(".maps"); SEC("perf_event") int lru_perf(void *ctx) { int key = 2; int init_val = 1; long *value; int i; bpf_printk("lru_perf"); bpf_map_update_elem(&pb, &key, &init_val, BPF_ANY); value = bpf_map_lookup_elem(&pb, &key); int 
ret = bpf_map_delete_elem(&pb, &key); return 0; }