Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> writes:

> On Thu, 7 Sept 2023 at 12:26, Toke Høiland-Jørgensen <toke@xxxxxxxxxx> wrote:
>>
>> +Arnaldo
>>
>> > Hi,
>> >
>> > Our bpf fuzzer, a customized Syzkaller, triggered a lockdep warning in
>> > the bpf queue map in v5.15. Since queue_stack_maps.c has no major changes
>> > since v5.15, we think this should still exist in the latest kernel.
>> > The bug can be occasionally triggered, and we suspect one of the
>> > eBPF programs involved to be the following one. We also attached the
>> > lockdep warning at the end.
>> >
>> > #define DEFINE_BPF_MAP_NO_KEY(the_map, TypeOfMap, MapFlags, TypeOfValue, MaxEntries) \
>> >         struct {                                        \
>> >                 __uint(type, TypeOfMap);                \
>> >                 __uint(map_flags, (MapFlags));          \
>> >                 __uint(max_entries, (MaxEntries));      \
>> >                 __type(value, TypeOfValue);             \
>> >         } the_map SEC(".maps");
>> >
>> > DEFINE_BPF_MAP_NO_KEY(map_0, BPF_MAP_TYPE_QUEUE, 0 | BPF_F_WRONLY,
>> >                       struct_0, 162);
>> >
>> > SEC("perf_event")
>> > int func(struct bpf_perf_event_data *ctx) {
>> >         char v0[96] = {};
>> >         uint64_t v1 = 0;
>> >
>> >         v1 = bpf_map_pop_elem(&map_0, v0);
>> >         return 163819661;
>> > }
>> >
>> > The program is attached to the following perf event.
>> >
>> > struct perf_event_attr attr_type_hw = {
>> >         .type = PERF_TYPE_HARDWARE,
>> >         .config = PERF_COUNT_HW_CPU_CYCLES,
>> >         .sample_freq = 50,
>> >         .inherit = 1,
>> >         .freq = 1,
>> > };
>> >
>> > ================================
>> > WARNING: inconsistent lock state
>> > 5.15.26+ #2 Not tainted
>> > --------------------------------
>> > inconsistent {INITIAL USE} -> {IN-NMI} usage.
>> > syz-executor.5/19749 [HC1[1]:SC0[0]:HE0:SE1] takes:
>> > ffff88804c9fc198 (&qs->lock){..-.}-{2:2}, at: __queue_map_get+0x31/0x250
>> > {INITIAL USE} state was registered at:
>> >   lock_acquire+0x1a3/0x4b0
>> >   _raw_spin_lock_irqsave+0x48/0x60
>> >   __queue_map_get+0x31/0x250
>> >   bpf_prog_577904e86c81dead_func+0x12/0x4b4
>> >   trace_call_bpf+0x262/0x5d0
>> >   perf_trace_run_bpf_submit+0x91/0x1c0
>> >   perf_trace_sched_switch+0x46c/0x700
>> >   __schedule+0x11b5/0x24a0
>> >   schedule+0xd4/0x270
>> >   futex_wait_queue_me+0x25f/0x520
>> >   futex_wait+0x1e0/0x5f0
>> >   do_futex+0x395/0x1890
>> >   __x64_sys_futex+0x1cb/0x480
>> >   do_syscall_64+0x3b/0xc0
>> >   entry_SYSCALL_64_after_hwframe+0x44/0xae
>> > irq event stamp: 13640
>> > hardirqs last enabled at (13639): [<ffffffff95eb2bf4>] _raw_spin_unlock_irq+0x24/0x40
>> > hardirqs last disabled at (13640): [<ffffffff95eb2d4d>] _raw_spin_lock_irqsave+0x5d/0x60
>> > softirqs last enabled at (13464): [<ffffffff93e26de5>] __sys_bpf+0x3e15/0x4e80
>> > softirqs last disabled at (13462): [<ffffffff93e26da3>] __sys_bpf+0x3dd3/0x4e80
>> >
>> > other info that might help us debug this:
>> >  Possible unsafe locking scenario:
>> >
>> >        CPU0
>> >        ----
>> >   lock(&qs->lock);
>> >   <Interrupt>
>> >     lock(&qs->lock);
>>
>> Hmm, so that lock() uses raw_spin_lock_irqsave(), which *should* be
>> disabling interrupts entirely for the critical section. But I guess a
>> Perf hardware event can still trigger? Which seems like it would
>> potentially wreak havoc with lots of things, not just this queue map
>> function?
>>
>> No idea how to protect against this, though. Hoping Arnaldo knows? :)
>>
>
> The locking should probably be protected by a percpu integer counter,
> incremented and decremented before and after the lock is taken,
> respectively. If it is already non-zero, then -EBUSY should be
> returned. It is similar to what htab_lock_bucket protects against in
> hashtab.c.

Ah, neat!
Okay, seems straight-forward enough to replicate. Hsin, could you please
check if the patch below gets rid of the splat?

-Toke

diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8d2ddcb7566b..f96945311eec 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -16,6 +16,7 @@
 struct bpf_queue_stack {
 	struct bpf_map map;
 	raw_spinlock_t lock;
+	int __percpu *map_locked;
 	u32 head, tail;
 	u32 size; /* max_entries + 1 */
 
@@ -66,6 +67,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_queue_stack *qs;
 	u64 size, queue_size;
+	int err = -ENOMEM;
 
 	size = (u64) attr->max_entries + 1;
 	queue_size = sizeof(*qs) + size * attr->value_size;
@@ -80,7 +82,18 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 
 	raw_spin_lock_init(&qs->lock);
 
+	qs->map_locked = bpf_map_alloc_percpu(&qs->map,
+					      sizeof(*qs->map_locked),
+					      sizeof(*qs->map_locked),
+					      GFP_USER);
+	if (!qs->map_locked)
+		goto free_map;
+
 	return &qs->map;
+
+free_map:
+	bpf_map_area_free(qs);
+	return ERR_PTR(err);
 }
 
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -88,9 +101,37 @@ static void queue_stack_map_free(struct bpf_map *map)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 
+	free_percpu(qs->map_locked);
 	bpf_map_area_free(qs);
 }
 
+static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
+				       unsigned long *pflags)
+{
+	unsigned long flags;
+
+	preempt_disable();
+	if (unlikely(__this_cpu_inc_return(*qs->map_locked) != 1)) {
+		__this_cpu_dec(*qs->map_locked);
+		preempt_enable();
+		return -EBUSY;
+	}
+
+	raw_spin_lock_irqsave(&qs->lock, flags);
+	*pflags = flags;
+
+	return 0;
+}
+
+
+static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
+					  unsigned long flags)
+{
+	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	__this_cpu_dec(*qs->map_locked);
+	preempt_enable();
+}
+
 static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
@@ -98,7 +139,9 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 	int err = 0;
 	void *ptr;
 
-	raw_spin_lock_irqsave(&qs->lock, flags);
+	err = queue_stack_map_lock(qs, &flags);
+	if (err)
+		return err;
 
 	if (queue_stack_map_is_empty(qs)) {
 		memset(value, 0, qs->map.value_size);
@@ -115,7 +158,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 	}
 
 out:
-	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	queue_stack_map_unlock(qs, flags);
 	return err;
 }
 
@@ -128,7 +171,9 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
 	void *ptr;
 	u32 index;
 
-	raw_spin_lock_irqsave(&qs->lock, flags);
+	err = queue_stack_map_lock(qs, &flags);
+	if (err)
+		return err;
 
 	if (queue_stack_map_is_empty(qs)) {
 		memset(value, 0, qs->map.value_size);
@@ -147,7 +192,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
 	qs->head = index;
 
 out:
-	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	queue_stack_map_unlock(qs, flags);
 	return err;
 }
 
@@ -193,7 +238,9 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
 	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&qs->lock, irq_flags);
+	err = queue_stack_map_lock(qs, &irq_flags);
+	if (err)
+		return err;
 
 	if (queue_stack_map_is_full(qs)) {
 		if (!replace) {
@@ -212,7 +259,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
 	qs->head = 0;
 
 out:
-	raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+	queue_stack_map_unlock(qs, irq_flags);
 	return err;
 }
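
For reference, here is a minimal userspace sketch of the guard pattern the
patch borrows from htab_lock_bucket(). It is purely illustrative, not kernel
code: the names guarded_lock()/guarded_unlock() are made up, a thread-local
counter stands in for the per-CPU map_locked, and a pthread spinlock stands in
for qs->lock. It just shows how a nested acquisition attempt in the same
execution context (the "NMI" is simulated by a plain function call) is
rejected with -EBUSY instead of deadlocking. Builds with gcc -pthread.

/* Userspace analogue of the per-CPU re-entrancy guard above. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t lock;          /* plays the role of qs->lock */
static _Thread_local int map_locked;     /* plays the role of *qs->map_locked */

/* Mirrors queue_stack_map_lock(): refuse to spin if this context already
 * holds (or interrupted a holder of) the lock. */
static int guarded_lock(void)
{
	if (++map_locked != 1) {
		--map_locked;
		return -EBUSY;  /* re-entrant attempt: back off, don't deadlock */
	}
	pthread_spin_lock(&lock);
	return 0;
}

/* Mirrors queue_stack_map_unlock(). */
static void guarded_unlock(void)
{
	pthread_spin_unlock(&lock);
	--map_locked;
}

/* Stand-in for the BPF program running from the perf NMI on the same CPU. */
static void simulated_nmi_handler(void)
{
	int err = guarded_lock();

	if (err == -EBUSY)
		printf("nested attempt rejected with -EBUSY (no deadlock)\n");
	else
		guarded_unlock();
}

int main(void)
{
	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);

	if (guarded_lock() == 0) {
		/* Critical section; the "NMI" fires here on the same CPU. */
		simulated_nmi_handler();
		guarded_unlock();
	}

	pthread_spin_destroy(&lock);
	return 0;
}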