Hi,

On 9/7/2023 9:04 PM, Toke Høiland-Jørgensen wrote:
> Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> writes:
>
>> On Thu, 7 Sept 2023 at 12:26, Toke Høiland-Jørgensen <toke@xxxxxxxxxx> wrote:
>>> +Arnaldo
>>>
>>>> Hi,
>>>>
>>>> Our bpf fuzzer, a customized Syzkaller, triggered a lockdep warning in
>>>> the bpf queue map in v5.15. Since queue_stack_maps.c has seen no major
>>>> changes since v5.15, we think the issue should still exist in the
>>>> latest kernel. The bug can only be triggered occasionally, and we
>>>> suspect one of the eBPF programs involved is the following one. The
>>>> lockdep warning is attached at the end.
>>>>
>>>> #define DEFINE_BPF_MAP_NO_KEY(the_map, TypeOfMap, MapFlags, TypeOfValue, MaxEntries) \
>>>>         struct {                                        \
>>>>                 __uint(type, TypeOfMap);                \
>>>>                 __uint(map_flags, (MapFlags));          \
>>>>                 __uint(max_entries, (MaxEntries));      \
>>>>                 __type(value, TypeOfValue);             \
>>>>         } the_map SEC(".maps");
>>>>
>>>> DEFINE_BPF_MAP_NO_KEY(map_0, BPF_MAP_TYPE_QUEUE, 0 | BPF_F_WRONLY,
>>>>                       struct_0, 162);
>>>>
>>>> SEC("perf_event")
>>>> int func(struct bpf_perf_event_data *ctx) {
>>>>         char v0[96] = {};
>>>>         uint64_t v1 = 0;
>>>>
>>>>         v1 = bpf_map_pop_elem(&map_0, v0);
>>>>         return 163819661;
>>>> }
>>>>
>>>> The program is attached to the following perf event.
>>>>
>>>> struct perf_event_attr attr_type_hw = {
>>>>         .type = PERF_TYPE_HARDWARE,
>>>>         .config = PERF_COUNT_HW_CPU_CYCLES,
>>>>         .sample_freq = 50,
>>>>         .inherit = 1,
>>>>         .freq = 1,
>>>> };
>>>>
>>>> ================================
>>>> WARNING: inconsistent lock state
>>>> 5.15.26+ #2 Not tainted
>>>> --------------------------------
>>>> inconsistent {INITIAL USE} -> {IN-NMI} usage.
>>>> syz-executor.5/19749 [HC1[1]:SC0[0]:HE0:SE1] takes:
>>>> ffff88804c9fc198 (&qs->lock){..-.}-{2:2}, at: __queue_map_get+0x31/0x250
>>>> {INITIAL USE} state was registered at:
>>>>   lock_acquire+0x1a3/0x4b0
>>>>   _raw_spin_lock_irqsave+0x48/0x60
>>>>   __queue_map_get+0x31/0x250
>>>>   bpf_prog_577904e86c81dead_func+0x12/0x4b4
>>>>   trace_call_bpf+0x262/0x5d0
>>>>   perf_trace_run_bpf_submit+0x91/0x1c0
>>>>   perf_trace_sched_switch+0x46c/0x700
>>>>   __schedule+0x11b5/0x24a0
>>>>   schedule+0xd4/0x270
>>>>   futex_wait_queue_me+0x25f/0x520
>>>>   futex_wait+0x1e0/0x5f0
>>>>   do_futex+0x395/0x1890
>>>>   __x64_sys_futex+0x1cb/0x480
>>>>   do_syscall_64+0x3b/0xc0
>>>>   entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>> irq event stamp: 13640
>>>> hardirqs last enabled at (13639): [<ffffffff95eb2bf4>] _raw_spin_unlock_irq+0x24/0x40
>>>> hardirqs last disabled at (13640): [<ffffffff95eb2d4d>] _raw_spin_lock_irqsave+0x5d/0x60
>>>> softirqs last enabled at (13464): [<ffffffff93e26de5>] __sys_bpf+0x3e15/0x4e80
>>>> softirqs last disabled at (13462): [<ffffffff93e26da3>] __sys_bpf+0x3dd3/0x4e80
>>>>
>>>> other info that might help us debug this:
>>>>  Possible unsafe locking scenario:
>>>>
>>>>        CPU0
>>>>        ----
>>>>   lock(&qs->lock);
>>>>   <Interrupt>
>>>>     lock(&qs->lock);
>>> Hmm, so that lock() uses raw_spin_lock_irqsave(), which *should* be
>>> disabling interrupts entirely for the critical section. But I guess a
>>> Perf hardware event can still trigger? Which seems like it would
>>> potentially wreak havoc with lots of things, not just this queue map
>>> function?
>>>
>>> No idea how to protect against this, though. Hoping Arnaldo knows? :)
>>>

It seems my reply from last night was dropped by the mailing list.

>> The locking should probably be protected by a percpu integer counter,
>> incremented and decremented before and after the lock is taken,
>> respectively. If it is already non-zero, then -EBUSY should be
>> returned. It is similar to what htab_lock_bucket protects against in
>> hashtab.c.
> Ah, neat! Okay, seems straight-forward enough to replicate. Hsin, could
> you please check if the patch below gets rid of the splat?

The patch could prevent the potential deadlock, but I think the lockdep
warning will still be there, because lockdep only considers a trylock
safe to take from NMI context. So using raw_spin_trylock() for the NMI
case would fix both the potential deadlock and the lockdep splat.
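Roughly something like this in __queue_map_get() (an untested sketch;
'qs' and 'flags' are the locals that function already has, and the other
places that take qs->lock would need the same treatment):

        /* An NMI can hit while qs->lock is already held on this CPU, so
         * never spin from NMI context; try the lock and bail out with
         * -EBUSY if it is contended. */
        if (in_nmi()) {
                if (!raw_spin_trylock_irqsave(&qs->lock, flags))
                        return -EBUSY;
        } else {
                raw_spin_lock_irqsave(&qs->lock, flags);
        }

That way the {IN-NMI} acquisition lockdep complains about becomes a
trylock, which it considers safe.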
>
> -Toke
>
>
> diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> index 8d2ddcb7566b..f96945311eec 100644
> --- a/kernel/bpf/queue_stack_maps.c
> +++ b/kernel/bpf/queue_stack_maps.c
> @@ -16,6 +16,7 @@
>  struct bpf_queue_stack {
>          struct bpf_map map;
>          raw_spinlock_t lock;
> +        int __percpu *map_locked;
>          u32 head, tail;
>          u32 size; /* max_entries + 1 */
>  
> @@ -66,6 +67,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
>          int numa_node = bpf_map_attr_numa_node(attr);
>          struct bpf_queue_stack *qs;
>          u64 size, queue_size;
> +        int err = -ENOMEM;
>  
>          size = (u64) attr->max_entries + 1;
>          queue_size = sizeof(*qs) + size * attr->value_size;
> @@ -80,7 +82,18 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
>  
>          raw_spin_lock_init(&qs->lock);
>  
> +        qs->map_locked = bpf_map_alloc_percpu(&qs->map,
> +                                              sizeof(*qs->map_locked),
> +                                              sizeof(*qs->map_locked),
> +                                              GFP_USER);
> +        if (!qs->map_locked)
> +                goto free_map;
> +
>          return &qs->map;
> +
> +free_map:
> +        bpf_map_area_free(qs);
> +        return ERR_PTR(err);
>  }
>  
>  /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
> @@ -88,9 +101,37 @@ static void queue_stack_map_free(struct bpf_map *map)
>  {
>          struct bpf_queue_stack *qs = bpf_queue_stack(map);
>  
> +        free_percpu(qs->map_locked);
>          bpf_map_area_free(qs);
>  }
>  
> +static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
> +                                       unsigned long *pflags)
> +{
> +        unsigned long flags;
> +
> +        preempt_disable();
> +        if (unlikely(__this_cpu_inc_return(*qs->map_locked) != 1)) {
> +                __this_cpu_dec(*qs->map_locked);
> +                preempt_enable();
> +                return -EBUSY;
> +        }
> +
> +        raw_spin_lock_irqsave(&qs->lock, flags);
> +        *pflags = flags;
> +
> +        return 0;
> +}
> +
> +
> +static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
> +                                          unsigned long flags)
> +{
> +        raw_spin_unlock_irqrestore(&qs->lock, flags);
> +        __this_cpu_dec(*qs->map_locked);
> +        preempt_enable();
> +}
> +
>  static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
>  {
>          struct bpf_queue_stack *qs = bpf_queue_stack(map);
> @@ -98,7 +139,9 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
>          int err = 0;
>          void *ptr;
>  
> -        raw_spin_lock_irqsave(&qs->lock, flags);
> +        err = queue_stack_map_lock(qs, &flags);
> +        if (err)
> +                return err;
>  
>          if (queue_stack_map_is_empty(qs)) {
>                  memset(value, 0, qs->map.value_size);
> @@ -115,7 +158,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
>          }
>  
>  out:
> -        raw_spin_unlock_irqrestore(&qs->lock, flags);
> +        queue_stack_map_unlock(qs, flags);
>          return err;
>  }
>  
> @@ -128,7 +171,9 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
>          void *ptr;
>          u32 index;
>  
> -        raw_spin_lock_irqsave(&qs->lock, flags);
> +        err = queue_stack_map_lock(qs, &flags);
> +        if (err)
> +                return err;
>  
>          if (queue_stack_map_is_empty(qs)) {
>                  memset(value, 0, qs->map.value_size);
> @@ -147,7 +192,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
>                  qs->head = index;
>  
>  out:
> -        raw_spin_unlock_irqrestore(&qs->lock, flags);
> +        queue_stack_map_unlock(qs, flags);
>          return err;
>  }
>  
> @@ -193,7 +238,9 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
>          if (flags & BPF_NOEXIST || flags > BPF_EXIST)
>                  return -EINVAL;
>  
> -        raw_spin_lock_irqsave(&qs->lock, irq_flags);
> +        err = queue_stack_map_lock(qs, &irq_flags);
> +        if (err)
> +                return err;
>  
>          if (queue_stack_map_is_full(qs)) {
>                  if (!replace) {
> @@ -212,7 +259,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
>                  qs->head = 0;
>  
>  out:
> -        raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
> +        queue_stack_map_unlock(qs, irq_flags);
>          return err;
>  }
>  
>
> .
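As an aside, in case it helps with reproducing this outside the fuzzer:
the trigger path is the BPF program running from the perf event's
overflow handler, which for a hardware PMU event fires in NMI context.
A minimal, hypothetical sketch of attaching such a program (the helper
name is made up, this is not the fuzzer's actual code, and 'prog_fd' is
assumed to be the fd of an already-loaded BPF_PROG_TYPE_PERF_EVENT
program):

        #include <linux/perf_event.h>
        #include <sys/ioctl.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Open a freq-based hw-cycles event on the calling thread and
         * attach prog_fd to it. Returns the perf event fd, or -1 on error. */
        static int attach_cycles_prog(int prog_fd)
        {
                struct perf_event_attr attr = {
                        .type = PERF_TYPE_HARDWARE,
                        .config = PERF_COUNT_HW_CPU_CYCLES,
                        .size = sizeof(struct perf_event_attr),
                        .sample_freq = 50,
                        .inherit = 1,
                        .freq = 1,
                };
                int pfd;

                /* pid = 0, cpu = -1: sample the calling thread on any CPU */
                pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
                if (pfd < 0)
                        return -1;

                /* the program then runs from the event's overflow handler */
                if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) ||
                    ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0)) {
                        close(pfd);
                        return -1;
                }
                return pfd;
        }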